
Commit 1c7d3fd

committed
Refactored the Qiushibaike spider with a coroutine pool (high performance)
1 parent 7e34b2e commit 1c7d3fd

File tree

1 file changed: +78 -83 lines changed

qsbk/qiushibaike.py

+78-83
@@ -1,88 +1,83 @@
-#!/usr/bin/env python3.6
-# coding=utf-8
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+# 1. Import the coroutine pool (gevent) and patch the standard library
+import gevent.monkey
+gevent.monkey.patch_all()
+from gevent.pool import Pool
+from queue import Queue
 import requests
 from lxml import etree
-import os
-import sys
-
-"""
-info:
-author:CriseLYJ
-github:https://github.com/CriseLYJ/
-update_time:2019-3-9
-"""
-page_init = "https://www.qiushibaike.com/text/"
-joke_init = "https://www.qiushibaike.com/article/"
-SAVE_PATH = os.path.join(os.getcwd(), 'jokes/')
-
-
-class Spider(object):
-    def __init__(self, page_num):
-        self.page_num = int(page_num)
-        # The first page needs special handling
-        self.page_urls = ["https://www.qiushibaike.com/text/"]
-        # page_url -> joke_url
-        self.joke_urls = []
-        # joke_url -> joke_name joke_content
-        # dict name : content
-        self.joke_content = ""
-        self.joke_id = 1
-        try:
-            os.mkdir(SAVE_PATH)
-        except Exception as e:
-            print(e)
-
-    def get_page_urls(self):
-        if self.page_num > 1:
-            # Build the remaining page links by iterating
-            for n in range(2, self.page_num + 1):
-                page_url = page_init + 'page/' + str(n) + '/'
-                self.page_urls.append(page_url)
-
-    def get_joke_urls(self):
-        for page_url in self.page_urls:
-            html = requests.get(page_url).content
-            selector = etree.HTML(html)
-            qiushi_id = selector.xpath('/html/body/div[@id="content"]/div/div[@id="content-left"]/div/@id')
-            for q_id in qiushi_id:
-                id = q_id.split('_')[2]
-                joke_url = joke_init + id + '/'
-                print(joke_url)
-                self.joke_urls.append(joke_url)
-
-    def get_joke(self):
-        for joke_url in self.joke_urls:
-            html = requests.get(joke_url).content
-            selector = etree.HTML(html)
-            one_joke = selector.xpath('//div[@class="word"]/div/text()')
-            self.joke_content = ""
-            for words in one_joke:
-                self.joke_content += words + '\n'
-            self.download()
-
-    def download(self):
-        joke_path = SAVE_PATH + str(self.joke_id) + '.txt'
-        self.joke_id += 1
-        # Path of the joke file
-        print(joke_path)
-        with open(joke_path, "w") as f:
-            f.write(self.joke_content)
-
-    def start(self):
-        # Get the listing page URLs
-        self.get_page_urls()
-        # Get the joke links
-        self.get_joke_urls()
-        # Get the jokes
-        self.get_joke()
-        # Call the download routine
-        self.download()
 
+class QiushiSpider(object):
 
-if __name__ == '__main__':
-    # Ask how many pages to fetch
-    page_num = input('Tell me: how many pages of jokes would you like? ')
+    def __init__(self, max_page):
+        self.max_page = max_page
+        # 2. Create the coroutine pool and set how many greenlets it holds
+        self.pool = Pool(5)
+
+        self.base_url = "http://www.qiushibaike.com/8hr/page/{}/"
+        self.headers = {
+            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
+        }
+
+        # Queue dedicated to holding the URLs
+        self.url_queue = Queue()
+
+    def get_url_list(self):
+        '''
+        Build the URL list and put it into the URL queue
+        :return:
+        '''
+        for page in range(1, self.max_page + 1):
+            url = self.base_url.format(page)
+            self.url_queue.put(url)
+
+    # 3. The task each pool worker executes
+    def exec_task(self):
+        # 1> Get a URL
+        url = self.url_queue.get()
+
+        # 2> Send the request and fetch the HTML
+        response = requests.get(url, headers=self.headers)
+        html = response.text
+
+        # 3> Parse the HTML and extract the data
+        eroot = etree.HTML(html)
+
+        titles = eroot.xpath('//a[@class="recmd-content"]/text()')
+        for title in titles:
+            item = {}
+            item["title"] = title
 
-    qb = Spider(page_num)
-    # Start the spider
-    qb.start()
+            # 4> Save the data
+            print(item)
+        self.url_queue.task_done()
+
+    # 4. Runs when a task finishes; it must accept at least one argument
+    # result is the return value of the finished task
+    def exec_task_finished(self, result):
+        print("result:", result)
+        print("task finished")
+        self.pool.apply_async(self.exec_task, callback=self.exec_task_finished)
+
+    def run(self):
+        self.get_url_list()
+
+        # 5. Run the tasks on pool workers and register a completion callback
+        # callback fires after each task finishes
+        for _ in range(5):
+            self.pool.apply_async(self.exec_task, callback=self.exec_task_finished)
+        self.url_queue.join()
+
+if __name__ == '__main__':
+    max_page = input("How many pages would you like? ")
+    spider = QiushiSpider(int(max_page))
+    spider.run()
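
For context on the scheduling pattern the new code relies on: gevent's Pool.apply_async runs a task on a free greenlet and invokes callback with the task's return value, and re-submitting the task from inside the callback keeps a fixed number of greenlets draining the queue until queue.Queue.join() unblocks in the main greenlet. A minimal, self-contained sketch of that pattern (the URLs and the worker body here are placeholders, not the spider's real ones):

#!/usr/bin/python3
# Minimal sketch of the pool + queue + callback pattern used above.
# The URLs and the worker body are placeholders, not Qiushibaike's.
import gevent.monkey
gevent.monkey.patch_all()  # make blocking queue waits cooperative

from gevent.pool import Pool
from queue import Queue

task_queue = Queue()
pool = Pool(3)  # three concurrent greenlets

def worker():
    url = task_queue.get()  # blocks until a URL is available
    print("handling", url)  # a real worker would fetch and parse here
    task_queue.task_done()  # tells join() one task is finished

def respawn(result):
    # The callback receives the worker's return value; re-submitting
    # the task here keeps the greenlet slot draining the queue.
    pool.apply_async(worker, callback=respawn)

if __name__ == '__main__':
    for n in range(10):
        task_queue.put("http://example.com/page/{}".format(n))
    for _ in range(3):
        pool.apply_async(worker, callback=respawn)
    task_queue.join()  # returns once every put() has a matching task_done()

The process exits once join() returns; any greenlets still blocked on the empty queue are simply abandoned, which is also how the spider above terminates.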
