1. Multithreading optimization — see the code below
(1) - 多线程优化.py :
from threading import Thread, Lock
from queue import Queue
import requests
from bs4 import BeautifulSoup
import json, time

'''
Exit condition for the crawl threads: exit when the page queue is empty.
Exit condition for the parse threads: first check whether the page queue is empty;
if it is, then check whether the data queue is also empty; if both are empty,
the parse thread exits.
'''
# Flag that tells the parse threads when to exit
g_flag = True

class CrawlThread(Thread):
    def __init__(self, name, page_queue, data_queue):
        super().__init__()
        self.name = name
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.url = 'http://www.fanjian.net/duanzi-{}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
        }

    def run(self):
        print('Thread --%s-- started----' % self.name)
        '''
        1. Take a page number from the page queue
        2. Build the url to request
        3. Send the request and get the response
        4. Put the response into the data queue
        '''
        while 1:
            # If the page queue is empty, this thread should exit
            if self.page_queue.empty():
                break
            page = self.page_queue.get()
            url = self.url.format(page)
            content = requests.get(url=url, headers=self.headers).text
            self.data_queue.put(content)
        print('Thread -%s- finished--' % self.name)

class ParseThread(Thread):
    def __init__(self, name, data_queue, fp, lock):
        super().__init__()
        self.name = name
        self.data_queue = data_queue
        self.fp = fp
        self.lock = lock

    def run(self):
        print('Thread --%s-- started----' % self.name)
        '''
        1. Take one response from the data queue
        2. Parse and process it
        '''
        while 1:
            if g_flag == False:
                break
            try:
                # Block for at most 3 seconds; give up if nothing arrives
                content = self.data_queue.get(True, 3)
            except Exception:
                break

            self.parse_content(content)

        print('Thread -%s- finished--' % self.name)

    def parse_content(self, content):
        # Build the soup object
        soup = BeautifulSoup(content, 'lxml')
        resps = soup.select('.cont-item')
        for resp in resps:
            title = resp.select('a')[0]['title']
            text = resp.select('.cont-list-main p')[0].string
            item = {
                'username': title,
                'content': text
            }
            # Serialize the item and write it under the lock
            string = json.dumps(item, ensure_ascii=False)
            self.lock.acquire()
            self.fp.write(string + '\n')
            self.lock.release()

def create_queue():
    page_queue = Queue()
    data_queue = Queue()
    for page in range(1, 8):
        page_queue.put(page)
    return page_queue, data_queue

def main():
    # A list to hold all the thread objects
    t_list = []
    # Open the output file
    fp = open('jian.txt', 'w', encoding='utf8')
    # Create the lock
    lock = Lock()
    # Create the queues here
    page_queue, data_queue = create_queue()
    # Create the crawl threads
    crawl_name_list = ['Crawl-1', 'Crawl-2', 'Crawl-3']
    for crawl_name in crawl_name_list:
        t_crawl = CrawlThread(crawl_name, page_queue, data_queue)
        t_crawl.start()
        # Keep the thread in the list
        t_list.append(t_crawl)
    # Create the parse threads
    parse_name_list = ['Parse-1', 'Parse-2', 'Parse-3']
    for parse_name in parse_name_list:
        t_parse = ParseThread(parse_name, data_queue, fp, lock)
        t_parse.start()
        t_list.append(t_parse)

    # Poll until the page queue is empty
    while 1:
        if page_queue.empty():
            break
        time.sleep(3)
    # Busy-wait until the data queue drains, then signal the parse threads
    while 1:
        if data_queue.empty():
            global g_flag
            g_flag = False
            break

    # The main thread must wait for all child threads before it ends
    for t_tmp in t_list:
        t_tmp.join()

    # Close the file
    fp.close()
    print('Main thread and all child threads finished')

if __name__ == '__main__':
    main()

'''
1. Parsing: each joke is parsed into a dict
2. Both the crawl and the parse threads run infinite loops - when should they exit?
'''
(2) - 多线程优化.py :
from threading import Thread, Lock
from queue import Queue
import requests
from bs4 import BeautifulSoup
import json, time

'''
Exit condition for the crawl threads: exit when the page queue is empty.
Exit condition for the parse threads: first check whether the page queue is empty;
if it is, then check whether the data queue is also empty; if both are empty,
the parse thread exits.
'''
# Flag that tells the parse threads when to exit
g_flag = True

class CrawlThread(Thread):
    def __init__(self, name, page_queue, data_queue):
        super().__init__()
        self.name = name
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.url = 'http://www.fanjian.net/duanzi-{}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
        }

    def run(self):
        print('Thread --%s-- started----' % self.name)
        '''
        1. Take a page number from the page queue
        2. Build the url to request
        3. Send the request and get the response
        4. Put the response into the data queue
        '''
        while 1:
            # If the page queue is empty, this thread should exit
            if self.page_queue.empty():
                break
            page = self.page_queue.get()
            url = self.url.format(page)
            content = requests.get(url=url, headers=self.headers).text
            self.data_queue.put(content)
        print('Thread -%s- finished--' % self.name)

class ParseThread(Thread):
    def __init__(self, name, data_queue, fp, lock):
        super().__init__()
        self.name = name
        self.data_queue = data_queue
        self.fp = fp
        self.lock = lock

    def run(self):
        print('Thread --%s-- started----' % self.name)
        '''
        1. Take one response from the data queue
        2. Parse and process it
        '''
        while 1:
            if g_flag == False:
                break
            try:
                # Block for at most 3 seconds; give up if nothing arrives
                content = self.data_queue.get(True, 3)
            except Exception:
                break
            self.parse_content(content)

        print('Thread -%s- finished--' % self.name)

    def parse_content(self, content):
        # Build the soup object
        soup = BeautifulSoup(content, 'lxml')
        resps = soup.select('.cont-item')
        for resp in resps:
            title = resp.select('a')[0]['title']
            text = resp.select('.cont-list-main p')[0].string
            item = {
                'username': title,
                'content': text
            }
            # Serialize the item and write it under the lock
            string = json.dumps(item, ensure_ascii=False)
            self.lock.acquire()
            self.fp.write(string + '\n')
            self.lock.release()

def create_queue():
    page_queue = Queue()
    data_queue = Queue()
    for page in range(1, 8):
        page_queue.put(page)
    return page_queue, data_queue

def main():
    # Two lists: one for the crawl threads, one for the parse threads
    t_crawl_list = []
    t_parse_list = []
    # Open the output file
    fp = open('jian.txt', 'w', encoding='utf8')
    # Create the lock
    lock = Lock()
    # Create the queues here
    page_queue, data_queue = create_queue()
    # Create the crawl threads
    crawl_name_list = ['Crawl-1', 'Crawl-2', 'Crawl-3']
    for crawl_name in crawl_name_list:
        t_crawl = CrawlThread(crawl_name, page_queue, data_queue)
        t_crawl.start()
        # Keep the thread in the list
        t_crawl_list.append(t_crawl)
    # Create the parse threads
    parse_name_list = ['Parse-1', 'Parse-2', 'Parse-3']
    for parse_name in parse_name_list:
        t_parse = ParseThread(parse_name, data_queue, fp, lock)
        t_parse.start()
        t_parse_list.append(t_parse)

    # Wait for the crawl threads: once they are joined, every response is in data_queue
    for t_crawl in t_crawl_list:
        t_crawl.join()

    # No need to poll the page queue any more
    # while 1:
    #     if page_queue.empty():
    #         break
    time.sleep(3)
    while 1:
        if data_queue.empty():
            global g_flag
            g_flag = False
            break

    # The main thread must wait for all child threads before it ends
    for t_parse in t_parse_list:
        t_parse.join()

    # Close the file
    fp.close()
    print('Main thread and all child threads finished')

if __name__ == '__main__':
    main()

'''
1. Parsing: each joke is parsed into a dict
2. Both the crawl and the parse threads run infinite loops - when should they exit?
'''
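Version (2) fixes a race in version (1): an empty page queue does not mean the crawl threads have finished their last downloads, so joining the crawl threads first guarantees everything is in data_queue before the g_flag shutdown check runs. For comparison, here is a minimal sketch (not from the original scripts) of the same producer/consumer shutdown done with Queue.task_done()/Queue.join() and sentinels, which removes both the global flag and the get() timeout:

from threading import Thread
from queue import Queue

def worker(data_queue):
    while True:
        item = data_queue.get()
        if item is None:              # sentinel: time to exit
            data_queue.task_done()
            break
        print('parsed', item)         # a real parser would process item here
        data_queue.task_done()        # tell the queue this item is handled

data_queue = Queue()
threads = [Thread(target=worker, args=(data_queue,)) for _ in range(3)]
for t in threads:
    t.start()

for page in range(1, 8):              # stand-in for the crawl threads' output
    data_queue.put(page)

data_queue.join()                     # blocks until every put item is task_done
for _ in threads:
    data_queue.put(None)              # one sentinel per worker
for t in threads:
    t.join()
print('all workers exited cleanly')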
2. Logging in to the Gushiwen site (gushiwen.org) :
Login (attempt 1): send the POST directly, then send a GET
Login (attempt 2): first send a GET to pick up some information, then send the POST, then send a GET
Login (attempt 3): first a GET, then a POST submitting the form, then a GET, then another GET to visit the logged-in page
Captcha: download it locally, then type it in by hand
login.py :
import requests
from bs4 import BeautifulSoup
import urllib.request
from PIL import Image
import pytesseract
import time

def shibie(image_path):
    img = Image.open(image_path)
    # Convert to grayscale
    img = img.convert('L')
    # Binarize: pixels below the threshold become black, the rest white
    threshold = 140
    table = []
    for i in range(256):
        if i < threshold:
            table.append(0)
        else:
            table.append(1)
    out = img.point(table, '1')

    # out.show()
    # Recognize the binarized image (use out, not the grayscale img,
    # otherwise the binarization above is thrown away)
    img = out.convert('RGB')

    return pytesseract.image_to_string(img)

# Create a session so cookies persist across requests
s = requests.Session()

i = 1

while 1:
    login_url = 'https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
    }
    r_log = s.get(url=login_url, headers=headers)

    # Download the captcha image locally (through the session, so it matches our cookies)
    soup = BeautifulSoup(r_log.text, 'lxml')
    image_src = 'https://so.gushiwen.org' + soup.find('img', id="imgCode")['src']
    # urllib.request.urlretrieve(image_src, 'code.png')
    r_image = s.get(image_src, headers=headers)
    with open('code.png', 'wb') as fp:
        fp.write(r_image.content)

    # Read the values of the hidden form fields
    views = soup.find('input', id="__VIEWSTATE")['value']
    viewg = soup.find('input', id="__VIEWSTATEGENERATOR")['value']

    # Send the post request
    # code = input('Enter the captcha: ')
    code = shibie('code.png')

    post_url = 'https://so.gushiwen.org/user/login.aspx?from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx'
    formdata = {
        '__VIEWSTATE': views,
        '__VIEWSTATEGENERATOR': viewg,
        'from': 'http://so.gushiwen.org/user/collect.aspx',
        'email': '1090509990@qq.com',
        'pwd': '123456',
        'code': code,
        'denglu': '登录',  # the submit button's value, exactly as the server expects it
    }
    r_post = s.post(url=post_url, headers=headers, data=formdata)

    # with open('gushi.html', 'wb') as fp:
    #     fp.write(r_post.content)
    # Check whether the login succeeded ('修改昵称' = "change nickname",
    # which only appears on the logged-in page)
    if '修改昵称' in r_post.text:
        print('Login attempt %s succeeded------' % i)
        break
    print('Sorry, login attempt %s failed' % i)
    i += 1
    time.sleep(2)
gushi.html (the response saved by the commented-out lines above; the alert near the bottom shows a failed captcha) :
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><meta http-equiv="Cache-Control" content="no-siteapp" /><meta http-equiv="Cache-Control" content="no-transform " /><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>
登录古诗文网
</title>
<link href="/user/reg.css" rel="stylesheet" type="text/css" />
<script src="/js/Code.js" type="text/javascript"></script>
<script type="text/javascript">
if ((navigator.userAgent.match(/(phone|pad|pod|iPhone|iPod|ios|iPad|Android|Mobile|BlackBerry|IEMobile|MQQBrowser|JUC|Fennec|wOSBrowser|BrowserNG|WebOS|Symbian|Windows Phone)/i))) {
window.location.href = "https://m.gushiwen.org/user/login.aspx";
} else {

}
</script>
<link href="/css/skinSo20180726.css" rel="stylesheet" type="text/css" />
<script>
var _hmt = _hmt || [];
(function () {
var hm = document.createElement("script");
hm.src = "//hm.baidu.com/hm.js?04660099568f561a75456483228a9516";
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(hm, s);
})();
</script>
</head>
<body onclick="closeshowBos()">
<div class="main1">
<div class="cont">
<div class="left">
<a href="https://www.gushiwen.org/">古诗文网</a>
</div>
<div class="right">
<div class="son1">
<a style="margin-left:1px;" href="https://www.gushiwen.org/">推荐</a>
<a href="https://www.gushiwen.org/shiwen/">诗文</a>
<a href="/mingju/">名句</a>
<a href="/authors/">作者</a>
<a href="/guwen/">古籍</a>
<a href="/user/collect.aspx" rel="nofollow" style="background-color:#757863;border-bottom:3px solid #F0EFE2;line-height:43px; height:43px;">收藏</a>
<a style="width:65px;" href="/app/" target="_blank">手机版</a>
</div>
<div class="son2">
<div class="search">
<form action="/search.aspx" onsubmit="return selectSearch()" contentType="text/html; charset=utf-8">
<input onkeydown="noajaxkeyUp()" onfocus="setInterval('showBos()',1000)" id="txtKey" name="value" type="text" value="" maxlength="40" autocomplete="off" style="height:25px; line-height:25px; float:left; padding-left:5px; width:264px; font-size:14px; clear:left; border:0px;" />
<input type="submit" style="float:right; width:20px; height:20px; clear:right; margin-top:4px; margin-right:3px; background-image:url(/img/docSearch.png); background-repeat:no-repeat; background-size:20px 20px; border:0px;cursor:pointer;" value="" />
<input id="b" style="display:none;" type="text" />
</form>
</div>
<div id="box"></div>
</div>
</div>
</div>
</div>
<form name="aspnetForm" method="post" action="./login.aspx?from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx" id="aspnetForm" onsubmit="return testFrom()">
<div>
<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="4NrBq/AG/YO+IS50yM+meKBeV0FjT94GVAQ3CtZZyQ+6pvV6bmjpEjfmNmTInyCREasu+1VV6LCxP3O/pcynR6A5ngXQD5jev+uBtfmK+pUmhz3eEGsyAkpuiZc=" />
</div>
<div>
<input type="hidden" name="__VIEWSTATEGENERATOR" id="__VIEWSTATEGENERATOR" value="C93BE1AE" />
</div>
<div class="mainreg"><span><b>登录古诗文网</b></span>
<span style="float:right; font-size:18px;">我还没有账号,先去<a href="/user/register.aspx">注册</a></span>
</div>
<div class="mainreg2">
<span>邮 箱</span>
<input style="display:none;" type="text" id="from" name="from" value="http://so.gushiwen.org/user/collect.aspx" />
<input type="text" id="email" name="email" onblur="testEmail()" onfocus="onEmial()" style=" height:25px; width:250px; font-size:14px; line-height:25px; padding-left:5px; margin-left:10px; float:left;" maxlength="50" value="" />
<span id="emailNo" style="color:Red; margin-left:5px; display:none;">ㄨ Email格式不正确</span>
</div>
<div class="mainreg2">
<span>密 码</span>
<input type="password" id="pwd" name="pwd" onblur="testPwd()" onfocus="onPwd()" style=" height:25px; width:250px; font-size:14px; line-height:25px; padding-left:5px; margin-left:10px; float:left;" maxlength="20" value="" />
<span id="pwdNo" style="color:Red; margin-left:5px; display:none;">ㄨ 长度为6~20个字符</span>
</div>
<div class="mainreg2">
<span>验证码</span>
<input type="text" id="code" name="code" style=" height:25px; width:42px; font-size:14px; line-height:25px; padding-left:5px; margin-left:10px; float:left;" maxlength="4" value="" />
<img id="imgCode" style="cursor: pointer; float:left; margin-left:5px; margin-top:1px;" width="60" height="25" src="/RandCode.ashx" onclick="GetCodeImg()" alt="看不清,换一张" />
</div>
<div class="mainreg2">
<span style="color:#E1E0C7;">自 动</span>
<a href="/user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx" style="float:left; margin-top:2px; margin-left:10px; ">忘记密码了</a>
</div>
<div class="mainreg2">
<span style="color:#E1E0C7;">登 陆</span>
<input type="submit" id="denglu" name="denglu" style=" height:25px; width:48px; font-size:14px; line-height:16px; margin-left:10px; float:left; cursor:pointer;" value="登录" />
</div>
<script>alert('提交失败,您输入的验证码有误!');history.back();</script></form>
<script defer="defer" src="login.js" type="text/javascript"></script>
<div class="main4">
© 2018 <a href="https://www.gushiwen.org/">古诗文网</a> | <a href="https://www.gushiwen.org/shiwen/">诗文</a> | <a href="/mingju/">名句</a> | <a href="/authors/">作者</a> | <a href="/guwen/">古籍</a> | <a href="/jiucuo.aspx?u=" target="_blank" rel="nofollow">纠错</a>
</div>
<script type="text/javascript">
window.onload = function () {
setIframeHeight(document.getElementById('external-frame'));
};
</script>
<script defer="defer" src="/js/skinso20180608.js" type="text/javascript"></script>
<script defer="defer" src="/js/jquery-3.2.1.min.js" type="text/javascript"></script>
<script defer="defer" src="/js/jquery.qrcode.min.js" type="text/javascript"></script>
</body>
</html>
3. Automatic captcha recognition :
(1) OCR with tesseract -- must be installed separately
    recognition from the command line
    the recognition rate is poor out of the box; it can be trained
    recognition from code:
        pip install pytesseract
        pip install pillow
    preprocess the image first, then recognize it, to raise the recognition rate
(2) captcha-solving platforms
    e.g. yundama (云打码)
tesseract.py :
import pytesseract
from PIL import Image
from PIL import ImageEnhance

img = Image.open('./code/office.png')
# Convert to grayscale
img = img.convert('L')

# Optional enhancement pipeline (left disabled):
# enhancer = ImageEnhance.Color(img)
# enhancer = enhancer.enhance(0)
# enhancer = ImageEnhance.Brightness(enhancer)
# enhancer = enhancer.enhance(2)
# enhancer = ImageEnhance.Contrast(enhancer)
# enhancer = enhancer.enhance(8)
# enhancer = ImageEnhance.Sharpness(enhancer)
# img = enhancer.enhance(20)

# Binarize: pixels below the threshold become black, the rest white
threshold = 140
table = []
for i in range(256):
    if i < threshold:
        table.append(0)
    else:
        table.append(1)
out = img.point(table, '1')

# out.show()
# Recognize the binarized image (use out, not img, so the binarization is not discarded)
img = out.convert('RGB')

print(pytesseract.image_to_string(img))
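Continuing from the script above, recognition on short captchas often improves if tesseract is told what to expect. Both options below are standard tesseract flags; the whitelist here assumes a purely alphanumeric captcha, so adjust it to the real character set:

# --psm 7 treats the image as a single line of text;
# tessedit_char_whitelist restricts the output alphabet
config = '--psm 7 -c tessedit_char_whitelist=0123456789abcdefghijklmnopqrstuvwxyz'
print(pytesseract.image_to_string(img, config=config))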
4. scrapy :
Scrapy architecture diagram: (image not reproduced here)
What is Scrapy? A powerful, lean Python web-crawling framework, itself written in Python. Under the hood it already handles scheduling, request queues, deduplication, and asynchronous networking (it is built on Twisted), and it exposes hooks so you only need to put your effort into extracting urls and parsing pages.
http://scrapy-chs.readthedocs.io/zh_CN/0.24/intro/tutorial.html
http://www.index.html?username=goudan&password=123
http://www.index.html?password=123&username=goudan
url fingerprint deduplication: the two urls above differ only in query-parameter order but request the same resource, so they should be treated as one request
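A minimal sketch of the fingerprinting idea (Scrapy's real implementation lives in scrapy.utils.request.request_fingerprint and also hashes the method and body): canonicalize the url by sorting its query parameters, then hash the result, so parameter order stops mattering.

import hashlib
from urllib.parse import urlparse, parse_qsl, urlencode

def url_fingerprint(url):
    parts = urlparse(url)
    # Canonicalize: sort the query parameters alphabetically
    query = urlencode(sorted(parse_qsl(parts.query)))
    canonical = '%s://%s%s?%s' % (parts.scheme, parts.netloc, parts.path, query)
    return hashlib.sha1(canonical.encode('utf8')).hexdigest()

seen = set()
for url in ['http://www.index.html?username=goudan&password=123',
            'http://www.index.html?password=123&username=goudan']:
    fp = url_fingerprint(url)
    print(fp, 'duplicate' if fp in seen else 'new')
    seen.add(fp)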

Install: pip install scrapy
(1) Getting to know the framework
    engine \ spider \ scheduler \ downloader \ pipeline
(2) Using the framework
    scrapy startproject xxx
(3) The directory layout
    firstbloodpro           project root
        firstbloodpro       project package
            __pycache__     cache directory
            spiders         spiders directory
                __pycache__     cache directory
                __init__.py     package marker
                lala.py         spider file (*) --- where the code is written
            __init__.py     package marker
            items.py        where the data structures are defined (*)
            middlewares.py  middlewares
            pipelines.py    pipeline file (*)
            settings.py     settings file (*)
        scrapy.cfg          project configuration (rarely touched)

(4) Generating a spider file
    cd firstbloodpro
    scrapy genspider xxx www.xxx.com
    for what the generated parameters mean, see qiubai.py (a sketch of the generated template follows below)
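For reference, the spider file produced by `scrapy genspider qiubai www.xxx.com` looks roughly like this (the exact template varies a little between Scrapy versions): the first argument becomes the spider's name, the second seeds allowed_domains and start_urls.

# -*- coding: utf-8 -*-
import scrapy

class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'                       # used by `scrapy crawl qiubai`
    allowed_domains = ['www.xxx.com']     # requests to other domains are filtered out
    start_urls = ['http://www.xxx.com/']  # the initial requests

    def parse(self, response):
        # called with the response of each url in start_urls
        pass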
(5) Running it
    cd firstbloodpro/firstbloodpro/spiders
    scrapy crawl qiubai
    edit settings.py: stop obeying robots.txt and set a custom UA (see the snippet below)
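Concretely, the two settings.py changes are (these are Scrapy's standard setting names):

# settings.py
ROBOTSTXT_OBEY = False   # a freshly generated project defaults to True
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'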
(6) Getting to know the response object
    response.text    : the content as a string
    response.body    : the content as bytes
    response.url     : the url that was requested
    response.headers : the response headers
    response.status  : the status code
    xpath is already integrated into scrapy, so use it directly (a sketch follows below):
    response.xpath('')
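A minimal sketch of those attributes and the built-in selectors inside a spider's parse method; the xpath expressions here are made-up placeholders, not selectors for any real page:

def parse(self, response):
    print(response.url, response.status)                   # url and status code
    for div in response.xpath('//div[@class="article"]'):  # placeholder xpath
        yield {
            'author': div.xpath('.//h2/text()').extract_first(),
            'content': div.xpath('.//p/text()').extract_first(),
        }
    # yielded dicts are exactly what `scrapy crawl qiubai -o qiubai.json` exports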
(7) Exporting with a single flag
    scrapy crawl qiubai -o qiubai.json
    scrapy crawl qiubai -o qiubai.xml
    scrapy crawl qiubai -o qiubai.csv

Fixing the blank lines that appear in the csv output:
https://blog.csdn.net/qq_38282706/article/details/80279912