当前位置：首页 > news >正文

第三次作业

news 2026/6/11 2:18:26

作业①

气象爬取

实验要求

要求：指定一个网站，爬取这个网站中的所有的所有图片，例如中国气象网（http://www.weather.com.cn）。实现单线程和多线程的方式爬取。

核心代码和运行结果

点击查看代码

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor  target_url = "http://www.weather.com.cn"
headers = {'User-Agent': 'Mozilla/5.0'}def get_image_links(url):print(f"正在分析网页: {url} ...")try:resp = requests.get(url, headers=headers, timeout=10)soup = BeautifulSoup(resp.content, 'html.parser')links = []for img in soup.find_all('img'):src = img.get('src') or img.get('data-src')if src:full_url = urljoin(url, src)links.append(full_url)set() return list(set(links))except Exception as e:print(f"网页打开失败: {e}")return []def download_one(img_url, index, save_folder):try:# 文件名: pic_1.jpg, pic_2.jpg ...filename = f"pic_{index}.jpg"filepath = os.path.join(save_folder, filename)if os.path.exists(filepath):print(f"跳过(已存在): {filename} \n   -> 来源: {img_url}")returndata = requests.get(img_url, headers=headers, timeout=5).contentwith open(filepath, 'wb') as f:f.write(data)print(f"√ 下载成功: {filename} \n   -> 来源: {img_url}")except Exception as e:print(f"× 下载失败: {img_url} -> {e}")def run_single_thread():print("\n" + "=" * 40)print("单线程下载")print("=" * 40)save_folder = "images_single"if not os.path.exists(save_folder): os.makedirs(save_folder)links = get_image_links(target_url)print(f"找到 {len(links)} 张图片，开始排队下载...\n")# 一个循环，必须等上一张下完，才能下下一张for i, link in enumerate(links):download_one(link, i + 1, save_folder)print("\n单线程下载全部完成")def run_multi_thread():print("\n" + "=" * 40)print("多线程下载 ")print("=" * 40)save_folder = "images_multi"if not os.path.exists(save_folder): os.makedirs(save_folder)links = get_image_links(target_url)print(f"找到 {len(links)} 张图片，启动 10 个线程同时下载...\n")# 使用线程池，max_workers=10 表示同时派10个人去下载with ThreadPoolExecutor(max_workers=10) as executor:for i, link in enumerate(links):executor.submit(download_one, link, i + 1, save_folder)print("\n多线程下载指令已全部发送")# === 主程序 ===
if __name__ == "__main__":# 1. 运行单线程版 (结果保存在 images_single 文件夹)run_single_thread()# 2. 运行多线程版 (结果保存在 images_multi 文件夹)run_multi_thread()

心得体会

我利用 requests 获取网页源码并用 BeautifulSoup 解析提取图片链接，通过复用同一个下载函数，分别进行单线程顺序执行和基于 ThreadPoolExecutor 的多线程并发下载。

作业②

股票信息爬取

实验要求

用scrapy和xpath爬取股票信息

核心代码和实验结果

点击查看代码

import scrapy
from datetime import datetime
from stock_spider.items import StockItem
import jsonclass EastmoneySpider(scrapy.Spider):name = "eastmoney"allowed_domains = ["eastmoney.com"]base_url = 'http://80.push2.eastmoney.com/api/qt/clist/get'def start_requests(self):params = {'pn': 1,'pz': 100,'po': 1,'np': 1,'fltt': 2,'invt': 2,'fid': 'f3','fs': 'm:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23','fields': 'f2,f3,f4,f5,f6,f7,f12,f14,f15,f16,f17,f18'}url = self.base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()])yield scrapy.Request(url, callback=self.parse)def parse(self, response):try:data = json.loads(response.text)stocks = data.get('data', {}).get('diff', [])for index, stock in enumerate(stocks, start=1):item = StockItem()item['sequence_id'] = indexitem['stock_code'] = stock.get('f12', '')item['stock_name'] = stock.get('f14', '')item['latest_price'] = float(stock.get('f2', 0))item['change_percent'] = f"{stock.get('f3', 0):.2f}%"item['change_amount'] = float(stock.get('f4', 0))item['volume'] = f"{float(stock.get('f5', 0) / 10000):.2f}万"item['turnover'] = f"{float(stock.get('f6', 0) / 100000000):.2f}亿"item['amplitude'] = f"{stock.get('f7', 0):.2f}%"item['highest'] = float(stock.get('f15', 0))item['lowest'] = float(stock.get('f16', 0))item['today_open'] = float(stock.get('f17', 0))item['yesterday_close'] = float(stock.get('f18', 0))item['crawl_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')yield itemexcept Exception as e:self.logger.error(f"✗ 数据解析失败: {e}")

心得体会

我用 Scrapy 框架绕过前端页面，直接向东方财富网的后台 JSON API 接口发送请求，通过解析返回的 JSON 数据并将特定的键名比如f12, f14映射为股票信息来实现数据抓取。刚开始创建scrapy和配置管道文件时遇到困难，通过询问大模型和查询来解决。

作业③

爬取外汇网站数

实验要求

用scrapy+xpath+mysql爬取外汇网站数

核心代码和实验结果

点击查看代码

import scrapy
from scrapy_project.items import CurrencyItem
from datetime import datetimeclass CurrencySpider(scrapy.Spider):name = "currency"allowed_domains = ["boc.cn"]start_urls = ["https://www.boc.cn/sourcedb/whpj/"]def parse(self, response):rows = response.xpath('//table[@align="left" and @width="100%"]//tr')update_time = Nonetime_xpath_options = ['//div[@align="right"]//text()','//p[contains(text(), "更新时间")]//text()','//font[contains(text(), "更新时间")]//text()','//td[contains(text(), "更新时间")]//text()','//*[contains(text(), "更新时间")]//text()',]for xpath in time_xpath_options:time_texts = response.xpath(xpath).getall()for text in time_texts:if '更新时间' in text or ':' in text:update_time = text.strip()self.logger.info(f"找到更新时间: {update_time}")breakif update_time:breakif not update_time:update_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')self.logger.warning(f"未找到更新时间，使用当前时间: {update_time}")self.logger.info(f"找到 {len(rows)} 行数据")for i, row in enumerate(rows):if i == 0:continuecells = row.xpath('.//td/text()').getall()cells = [cell.strip() for cell in cells if cell.strip()]if len(cells) < 6:self.logger.warning(f"第 {i} 行数据不完整: {cells}")continuetry:item = CurrencyItem()item['currency'] = cells[0]  # 货币名称item['tbp'] = self.parse_float(cells[1])  # 现汇买入价item['cbp'] = self.parse_float(cells[2])  # 现钞买入价item['tsp'] = self.parse_float(cells[3])  # 现汇卖出价item['csp'] = self.parse_float(cells[4])  # 现钞卖出价item['time'] = update_timeyield itemexcept Exception as e:self.logger.error(f"解析第 {i} 行时出错: {e}, 数据: {cells}")def parse_float(self, value):try:value = value.strip()if not value or value == '--':return 0.0return float(value)except (ValueError, AttributeError):return 0.0

心得体会

我使用 Scrapy 框架访问中国银行外汇牌价网页，采用多种 XPath定位页面更新时间，并遍历解析 HTML 表格行来提取和清洗各种货币的现汇/现钞买入卖出价数据，最终生成结构化的 CurrencyItem 对象。其中在获取更新时间是遇到的困难，无法读取时间，最后依次尝试定位特定布局属性、标签内的关键词），最后使用全页面通配符搜索，以保证能读取到时间

查看全文

http://www.zskr.cn/news/74656.html