-#!/usr/bin/env python3.6
-# coding=utf-8
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+# 1. Import the pool module (gevent's coroutine pool doubles as the "thread pool");
+# patch_all() must run before the networking imports so their blocking I/O
+# becomes cooperative.
+import gevent.monkey
+gevent.monkey.patch_all()
+from gevent.pool import Pool
+from queue import Queue
 import requests
 from lxml import etree
-import os
-import sys
-
-"""
-info:
-author:CriseLYJ
-github:https://github.com/CriseLYJ/
-update_time:2019-3-9
-"""
-page_init = "https://www.qiushibaike.com/text/"
-joke_init = "https://www.qiushibaike.com/article/"
-SAVE_PATH = os.path.join(os.getcwd(), 'jokes/')
-
-
-class Spider(object):
-    def __init__(self, page_num):
-        self.page_num = int(page_num)
-        # The first page has a special url, so seed the list with it
-        self.page_urls = ["https://www.qiushibaike.com/text/"]
-        # page_url -> joke_url
-        self.joke_urls = []
-        # joke_url -> joke_name joke_content
-        # dict name : content
-        self.joke_content = ""
-        self.joke_id = 1
-        try:
-            os.mkdir(SAVE_PATH)
-        except Exception as e:
-            print(e)
-
-    def get_page_urls(self):
-        if self.page_num > 1:
-            # Build the remaining page links by iteration
-            for n in range(2, self.page_num + 1):
-                page_url = page_init + 'page/' + str(n) + '/'
-                self.page_urls.append(page_url)
-
-    def get_joke_urls(self):
-        for page_url in self.page_urls:
-            html = requests.get(page_url).content
-            selector = etree.HTML(html)
-            qiushi_id = selector.xpath('/html/body/div[@id="content"]/div/div[@id="content-left"]/div/@id')
-            for q_id in qiushi_id:
-                id = q_id.split('_')[2]
-                joke_url = joke_init + id + '/'
-                print(joke_url)
-                self.joke_urls.append(joke_url)
-
-    def get_joke(self):
-        for joke_url in self.joke_urls:
-            html = requests.get(joke_url).content
-            selector = etree.HTML(html)
-            one_joke = selector.xpath('//div[@class="word"]/div/text()')
-            self.joke_content = ""
-            for words in one_joke:
-                self.joke_content += words + '\n'
-            self.download()
-
-    def download(self):
-        joke_path = SAVE_PATH + str(self.joke_id) + '.txt'
-        self.joke_id += 1
-        # Path of the joke file
-        print(joke_path)
-        with open(joke_path, "w") as f:
-            f.write(self.joke_content)
-
-    def start(self):
-        # Build the page urls
-        self.get_page_urls()
-        # Collect the joke urls
-        self.get_joke_urls()
-        # Fetch the jokes (each one is written out as it is parsed)
-        self.get_joke()
-        # Call the download interface
-        self.download()
 
+class QiushiSpider:
 
-if __name__ == '__main__':
-    # Ask the user how many pages to crawl
-    page_num = input('How many pages of jokes do you want? ')
+    def __init__(self, max_page):
+        self.max_page = max_page
+        # 2. Create the pool, fixing the number of concurrent workers
+        self.pool = Pool(5)
+
+        self.base_url = "http://www.qiushibaike.com/8hr/page/{}/"
+        self.headers = {
+            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
+        }
+
+        # Queue that holds the urls waiting to be crawled
+        self.url_queue = Queue()
+
+    def get_url_list(self):
+        '''
+        Build the url list and put every url into the queue.
+        :return:
+        '''
+        # go up to and including max_page so the user gets exactly that many pages
+        for page in range(1, self.max_page + 1):
+            url = self.base_url.format(page)
+            self.url_queue.put(url)
+
+    # 3. The task each pool worker executes
+    def exec_task(self):
+        # 1> Take one url from the queue
+        url = self.url_queue.get()
+
+        # 2> Request the page and read the html
+        response = requests.get(url, headers=self.headers)
+        html = response.text
+
+        # 3> Parse the html and extract the data
+        eroot = etree.HTML(html)
+
+        titles = eroot.xpath('//a[@class="recmd-content"]/text()')
+        for title in titles:
+            item = {}
+            item["title"] = title
 
-    qb = Spider(page_num)
-    # Start the spider
-    qb.start()
+            # 4> Save the data (just print it for now)
+            print(item)
+        # Mark this url as finished so url_queue.join() can eventually return
+        self.url_queue.task_done()
+
+    # 4. Hook that runs after a task finishes; it must accept at least one
+    # argument: result is the return value of the finished task.
+    def exec_task_finished(self, result):
+        print("result:", result)
+        print("task finished")
+        # Resubmit the task so this worker keeps draining the queue
+        self.pool.apply_async(self.exec_task, callback=self.exec_task_finished)
+
+    def run(self):
+        self.get_url_list()
+
+        # 5. Run the task on the pool's workers and register the callback
+        # that fires after each task completes
+        for i in range(5):
+            self.pool.apply_async(self.exec_task, callback=self.exec_task_finished)
+        # Block until every queued url has been marked task_done()
+        self.url_queue.join()
+
+if __name__ == '__main__':
+    max_page = input("How many pages do you need? ")
+    spider = QiushiSpider(int(max_page))
+    spider.run()
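
Taken together, steps 1 through 5 form a fixed-size worker pattern: five pool tasks each pull one url from the queue, and the completion callback resubmits the task so the workers keep running until url_queue.join() unblocks. The sketch below shows the same pattern in isolation; handle, task, resubmit, and the example urls are invented for illustration, and only the gevent/queue calls mirror the spider above.

    import gevent.monkey
    gevent.monkey.patch_all()        # patch before any I/O imports
    from gevent.pool import Pool
    from queue import Queue

    work = Queue()
    pool = Pool(3)                   # three concurrent workers

    def handle(url):
        # stand-in for the real request/parse work
        print("handled", url)

    def task():
        url = work.get()             # blocks until a url is available
        handle(url)
        work.task_done()             # one task_done per get, or join() never returns

    def resubmit(result):
        # completion callback: immediately queue the next task for this worker
        pool.apply_async(task, callback=resubmit)

    for n in range(10):
        work.put("http://example.com/page/{}".format(n))
    for _ in range(3):               # seed one task per pool slot
        pool.apply_async(task, callback=resubmit)
    work.join()                      # returns once all ten urls are task_done

After join() returns, the three resubmitted tasks are still blocked in work.get(); they are simply abandoned when the process exits, which is also what happens in the spider above.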
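Because the full url list is known before any crawling starts, the queue-and-callback plumbing is not the only option: gevent's Pool.map can fan a function out over the urls and block until all of them finish. A minimal alternative sketch, assuming a hypothetical parse_page in place of exec_task's request/parse body:

    import gevent.monkey
    gevent.monkey.patch_all()
    from gevent.pool import Pool
    import requests

    def parse_page(url):
        # hypothetical stand-in: fetch the page and report its size
        return len(requests.get(url).text)

    pool = Pool(5)
    urls = ["http://www.qiushibaike.com/8hr/page/{}/".format(p) for p in range(1, 4)]
    for size in pool.map(parse_page, urls):   # blocks until every url is done
        print(size)

The queue version in the commit trades this simplicity for the ability to keep feeding urls in while the workers are already running.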