1. Multithreading optimization — see the code below
(1) - 多线程优化.py :
from threading import Thread, Lock
from queue import Queue
import requests
from bs4 import BeautifulSoup
import json, time

'''
Exit condition for the crawl threads: exit when the page queue is empty.
Exit condition for the parse threads: first check whether the page queue is empty;
if it is, then check whether the data queue is also empty; if both are empty,
the parse thread exits.
'''
# Flag that tells the parse threads when to exit
g_flag = True

class CrawlThread(Thread):
    def __init__(self, name, page_queue, data_queue):
        super().__init__()
        self.name = name
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.url = 'http://www.fanjian.net/duanzi-{}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
        }

    def run(self):
        print('Thread --%s-- started----' % self.name)
        '''
        1. Take a page number from the page queue
        2. Build the url to request
        3. Send the request and get the response
        4. Put the response into the data queue
        '''
        while 1:
            # If the page queue is empty, this thread should exit
            if self.page_queue.empty():
                break
            page = self.page_queue.get()
            url = self.url.format(page)
            content = requests.get(url=url, headers=self.headers).text
            self.data_queue.put(content)
        print('Thread -%s- finished--' % self.name)

class ParseThread(Thread):
    def __init__(self, name, data_queue, fp, lock):
        super().__init__()
        self.name = name
        self.data_queue = data_queue
        self.fp = fp
        self.lock = lock

    def run(self):
        print('Thread --%s-- started----' % self.name)
        '''
        1. Take one response from the data queue
        2. Parse and process it
        '''
        while 1:
            if g_flag == False:
                break
            try:
                # Block for at most 3 seconds; give up if nothing arrives
                content = self.data_queue.get(True, 3)
            except Exception:
                break

            self.parse_content(content)

        print('Thread -%s- finished--' % self.name)

    def parse_content(self, content):
        # Build the soup object
        soup = BeautifulSoup(content, 'lxml')
        resps = soup.select('.cont-item')
        for resp in resps:
            title = resp.select('a')[0]['title']
            text = resp.select('.cont-list-main p')[0].string
            item = {
                'username': title,
                'content': text
            }
            # Serialize the item and write it under the lock
            string = json.dumps(item, ensure_ascii=False)
            self.lock.acquire()
            self.fp.write(string + '\n')
            self.lock.release()

def create_queue():
    page_queue = Queue()
    data_queue = Queue()
    for page in range(1, 8):
        page_queue.put(page)
    return page_queue, data_queue

def main():
    # A list to hold all the thread objects
    t_list = []
    # Open the output file
    fp = open('jian.txt', 'w', encoding='utf8')
    # Create the lock
    lock = Lock()
    # Create the queues here
    page_queue, data_queue = create_queue()
    # Create the crawl threads
    crawl_name_list = ['Crawl-1', 'Crawl-2', 'Crawl-3']
    for crawl_name in crawl_name_list:
        t_crawl = CrawlThread(crawl_name, page_queue, data_queue)
        t_crawl.start()
        # Keep the thread in the list
        t_list.append(t_crawl)
    # Create the parse threads
    parse_name_list = ['Parse-1', 'Parse-2', 'Parse-3']
    for parse_name in parse_name_list:
        t_parse = ParseThread(parse_name, data_queue, fp, lock)
        t_parse.start()
        t_list.append(t_parse)

    # Poll until the page queue is empty
    while 1:
        if page_queue.empty():
            break
        time.sleep(3)
    # Busy-wait until the data queue drains, then signal the parse threads
    while 1:
        if data_queue.empty():
            global g_flag
            g_flag = False
            break

    # The main thread must wait for all child threads before it ends
    for t_tmp in t_list:
        t_tmp.join()

    # Close the file
    fp.close()
    print('Main thread and all child threads finished')

if __name__ == '__main__':
    main()

'''
1. Parsing: each joke is parsed into a dict
2. Both the crawl and the parse threads run infinite loops - when should they exit?
'''
(2) - 多线程优化.py :
from threading import Thread, Lock
from queue import Queue
import requests
from bs4 import BeautifulSoup
import json, time

'''
Exit condition for the crawl threads: exit when the page queue is empty.
Exit condition for the parse threads: first check whether the page queue is empty;
if it is, then check whether the data queue is also empty; if both are empty,
the parse thread exits.
'''
# Flag that tells the parse threads when to exit
g_flag = True

class CrawlThread(Thread):
    def __init__(self, name, page_queue, data_queue):
        super().__init__()
        self.name = name
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.url = 'http://www.fanjian.net/duanzi-{}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
        }

    def run(self):
        print('Thread --%s-- started----' % self.name)
        '''
        1. Take a page number from the page queue
        2. Build the url to request
        3. Send the request and get the response
        4. Put the response into the data queue
        '''
        while 1:
            # If the page queue is empty, this thread should exit
            if self.page_queue.empty():
                break
            page = self.page_queue.get()
            url = self.url.format(page)
            content = requests.get(url=url, headers=self.headers).text
            self.data_queue.put(content)
        print('Thread -%s- finished--' % self.name)

class ParseThread(Thread):
    def __init__(self, name, data_queue, fp, lock):
        super().__init__()
        self.name = name
        self.data_queue = data_queue
        self.fp = fp
        self.lock = lock

    def run(self):
        print('Thread --%s-- started----' % self.name)
        '''
        1. Take one response from the data queue
        2. Parse and process it
        '''
        while 1:
            if g_flag == False:
                break
            try:
                # Block for at most 3 seconds; give up if nothing arrives
                content = self.data_queue.get(True, 3)
            except Exception:
                break
            self.parse_content(content)

        print('Thread -%s- finished--' % self.name)

    def parse_content(self, content):
        # Build the soup object
        soup = BeautifulSoup(content, 'lxml')
        resps = soup.select('.cont-item')
        for resp in resps:
            title = resp.select('a')[0]['title']
            text = resp.select('.cont-list-main p')[0].string
            item = {
                'username': title,
                'content': text
            }
            # Serialize the item and write it under the lock
            string = json.dumps(item, ensure_ascii=False)
            self.lock.acquire()
            self.fp.write(string + '\n')
            self.lock.release()

def create_queue():
    page_queue = Queue()
    data_queue = Queue()
    for page in range(1, 8):
        page_queue.put(page)
    return page_queue, data_queue

def main():
    # Two lists: one for the crawl threads, one for the parse threads
    t_crawl_list = []
    t_parse_list = []
    # Open the output file
    fp = open('jian.txt', 'w', encoding='utf8')
    # Create the lock
    lock = Lock()
    # Create the queues here
    page_queue, data_queue = create_queue()
    # Create the crawl threads
    crawl_name_list = ['Crawl-1', 'Crawl-2', 'Crawl-3']
    for crawl_name in crawl_name_list:
        t_crawl = CrawlThread(crawl_name, page_queue, data_queue)
        t_crawl.start()
        # Keep the thread in the list
        t_crawl_list.append(t_crawl)
    # Create the parse threads
    parse_name_list = ['Parse-1', 'Parse-2', 'Parse-3']
    for parse_name in parse_name_list:
        t_parse = ParseThread(parse_name, data_queue, fp, lock)
        t_parse.start()
        t_parse_list.append(t_parse)

    # Wait for the crawl threads: once they are joined, every response is in data_queue
    for t_crawl in t_crawl_list:
        t_crawl.join()

    # No need to poll the page queue any more
    # while 1:
    #     if page_queue.empty():
    #         break
    time.sleep(3)
    while 1:
        if data_queue.empty():
            global g_flag
            g_flag = False
            break

    # The main thread must wait for all child threads before it ends
    for t_parse in t_parse_list:
        t_parse.join()

    # Close the file
    fp.close()
    print('Main thread and all child threads finished')

if __name__ == '__main__':
    main()

'''
1. Parsing: each joke is parsed into a dict
2. Both the crawl and the parse threads run infinite loops - when should they exit?
'''
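Version (2) fixes a race in version (1): an empty page queue does not mean the crawl threads have finished their last downloads, so joining the crawl threads first guarantees everything is in data_queue before the g_flag shutdown check runs. For comparison, here is a minimal sketch (not from the original scripts) of the same producer/consumer shutdown done with Queue.task_done()/Queue.join() and sentinels, which removes both the global flag and the get() timeout:

from threading import Thread
from queue import Queue

def worker(data_queue):
    while True:
        item = data_queue.get()
        if item is None:              # sentinel: time to exit
            data_queue.task_done()
            break
        print('parsed', item)         # a real parser would process item here
        data_queue.task_done()        # tell the queue this item is handled

data_queue = Queue()
threads = [Thread(target=worker, args=(data_queue,)) for _ in range(3)]
for t in threads:
    t.start()

for page in range(1, 8):              # stand-in for the crawl threads' output
    data_queue.put(page)

data_queue.join()                     # blocks until every put item is task_done
for _ in threads:
    data_queue.put(None)              # one sentinel per worker
for t in threads:
    t.join()
print('all workers exited cleanly')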
2. Logging in to the Gushiwen site (gushiwen.org) :
Login (attempt 1): send the POST directly, then send a GET
Login (attempt 2): first send a GET to pick up some information, then send the POST, then send a GET
Login (attempt 3): first a GET, then a POST submitting the form, then a GET, then another GET to visit the logged-in page
Captcha: download it locally, then type it in by hand
login.py :
import requests
from bs4 import BeautifulSoup
import urllib.request
from PIL import Image
import pytesseract
import time

def shibie(image_path):
    img = Image.open(image_path)
    # Convert to grayscale
    img = img.convert('L')
    # Binarize: pixels below the threshold become black, the rest white
    threshold = 140
    table = []
    for i in range(256):
        if i < threshold:
            table.append(0)
        else:
            table.append(1)
    out = img.point(table, '1')

    # out.show()
    # Recognize the binarized image (use out, not the grayscale img,
    # otherwise the binarization above is thrown away)
    img = out.convert('RGB')

    return pytesseract.image_to_string(img)

# Create a session so cookies persist across requests
s = requests.Session()

i = 1

while 1:
    login_url = 'https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
    }
    r_log = s.get(url=login_url, headers=headers)

    # Download the captcha image locally (through the session, so it matches our cookies)
    soup = BeautifulSoup(r_log.text, 'lxml')
    image_src = 'https://so.gushiwen.org' + soup.find('img', id="imgCode")['src']
    # urllib.request.urlretrieve(image_src, 'code.png')
    r_image = s.get(image_src, headers=headers)
    with open('code.png', 'wb') as fp:
        fp.write(r_image.content)

    # Read the values of the hidden form fields
    views = soup.find('input', id="__VIEWSTATE")['value']
    viewg = soup.find('input', id="__VIEWSTATEGENERATOR")['value']

    # Send the post request
    # code = input('Enter the captcha: ')
    code = shibie('code.png')

    post_url = 'https://so.gushiwen.org/user/login.aspx?from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx'
    formdata = {
        '__VIEWSTATE': views,
        '__VIEWSTATEGENERATOR': viewg,
        'from': 'http://so.gushiwen.org/user/collect.aspx',
        'email': '1090509990@qq.com',
        'pwd': '123456',
        'code': code,
        'denglu': '登录',  # the submit button's value, exactly as the server expects it
    }
    r_post = s.post(url=post_url, headers=headers, data=formdata)

    # with open('gushi.html', 'wb') as fp:
    #     fp.write(r_post.content)
    # Check whether the login succeeded ('修改昵称' = "change nickname",
    # which only appears on the logged-in page)
    if '修改昵称' in r_post.text:
        print('Login attempt %s succeeded------' % i)
        break
    print('Sorry, login attempt %s failed' % i)
    i += 1
    time.sleep(2)
gushi.html (the response saved by the commented-out lines above; the alert near the bottom shows a failed captcha) :
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><meta http-equiv="Cache-Control" content="no-siteapp" /><meta http-equiv="Cache-Control" content="no-transform " /><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>
登录古诗文网
</title>
<link href="/user/reg.css" rel="stylesheet" type="text/css" />
<script src="/js/Code.js" type="text/javascript"></script>
<script type="text/javascript">
if ((navigator.userAgent.match(/(phone|pad|pod|iPhone|iPod|ios|iPad|Android|Mobile|BlackBerry|IEMobile|MQQBrowser|JUC|Fennec|wOSBrowser|BrowserNG|WebOS|Symbian|Windows Phone)/i))) {
window.location.href = "https://m.gushiwen.org/user/login.aspx";
} else {

}
</script>
<link href="/css/skinSo20180726.css" rel="stylesheet" type="text/css" />
<script>
var _hmt = _hmt || [];
(function () {
var hm = document.createElement("script");
hm.src = "//hm.baidu.com/hm.js?04660099568f561a75456483228a9516";
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(hm, s);
})();
</script>
</head>
<body onclick="closeshowBos()">
<div class="main1">
<div class="cont">
<div class="left">
<a href="https://www.gushiwen.org/">古诗文网</a>
</div>
<div class="right">
<div class="son1">
<a style="margin-left:1px;" href="https://www.gushiwen.org/">推荐</a>
<a href="https://www.gushiwen.org/shiwen/">诗文</a>
<a href="/mingju/">名句</a>
<a href="/authors/">作者</a>
<a href="/guwen/">古籍</a>
<a href="/user/collect.aspx" rel="nofollow" style="background-color:#757863;border-bottom:3px solid #F0EFE2;line-height:43px; height:43px;">收藏</a>
<a style="width:65px;" href="/app/" target="_blank">手机版</a>
</div>
<div class="son2">
<div class="search">
<form action="/search.aspx" onsubmit="return selectSearch()" contentType="text/html; charset=utf-8">
<input onkeydown="noajaxkeyUp()" onfocus="setInterval('showBos()',1000)" id="txtKey" name="value" type="text" value="" maxlength="40" autocomplete="off" style="height:25px; line-height:25px; float:left; padding-left:5px; width:264px; font-size:14px; clear:left; border:0px;" />
<input type="submit" style="float:right; width:20px; height:20px; clear:right; margin-top:4px; margin-right:3px; background-image:url(/img/docSearch.png); background-repeat:no-repeat; background-size:20px 20px; border:0px;cursor:pointer;" value="" />
<input id="b" style="display:none;" type="text" />
</form>
</div>
<div id="box"></div>
</div>
</div>
</div>
</div>
<form name="aspnetForm" method="post" action="./login.aspx?from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx" id="aspnetForm" onsubmit="return testFrom()">
<div>
<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="4NrBq/AG/YO+IS50yM+meKBeV0FjT94GVAQ3CtZZyQ+6pvV6bmjpEjfmNmTInyCREasu+1VV6LCxP3O/pcynR6A5ngXQD5jev+uBtfmK+pUmhz3eEGsyAkpuiZc=" />
</div>
<div>
<input type="hidden" name="__VIEWSTATEGENERATOR" id="__VIEWSTATEGENERATOR" value="C93BE1AE" />
</div>
<div class="mainreg"><span><b>登录古诗文网</b></span>
<span style="float:right; font-size:18px;">我还没有账号,先去<a href="/user/register.aspx">注册</a></span>
</div>
<div class="mainreg2">
<span>邮 箱</span>
<input style="display:none;" type="text" id="from" name="from" value="http://so.gushiwen.org/user/collect.aspx" />
<input type="text" id="email" name="email" onblur="testEmail()" onfocus="onEmial()" style=" height:25px; width:250px; font-size:14px; line-height:25px; padding-left:5px; margin-left:10px; float:left;" maxlength="50" value="" />
<span id="emailNo" style="color:Red; margin-left:5px; display:none;">ㄨ Email格式不正确</span>
</div>
<div class="mainreg2">
<span>密 码</span>
<input type="password" id="pwd" name="pwd" onblur="testPwd()" onfocus="onPwd()" style=" height:25px; width:250px; font-size:14px; line-height:25px; padding-left:5px; margin-left:10px; float:left;" maxlength="20" value="" />
<span id="pwdNo" style="color:Red; margin-left:5px; display:none;">ㄨ 长度为6~20个字符</span>
</div>
<div class="mainreg2">
<span>验证码</span>
<input type="text" id="code" name="code" style=" height:25px; width:42px; font-size:14px; line-height:25px; padding-left:5px; margin-left:10px; float:left;" maxlength="4" value="" />
<img id="imgCode" style="cursor: pointer; float:left; margin-left:5px; margin-top:1px;" width="60" height="25" src="/RandCode.ashx" onclick="GetCodeImg()" alt="看不清,换一张" />
</div>
<div class="mainreg2">
<span style="color:#E1E0C7;">自 动</span>
<a href="/user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx" style="float:left; margin-top:2px; margin-left:10px; ">忘记密码了</a>
</div>
<div class="mainreg2">
<span style="color:#E1E0C7;">登 陆</span>
<input type="submit" id="denglu" name="denglu" style=" height:25px; width:48px; font-size:14px; line-height:16px; margin-left:10px; float:left; cursor:pointer;" value="登录" />
</div>
<script>alert('提交失败,您输入的验证码有误!');history.back();</script></form>
<script defer="defer" src="login.js" type="text/javascript"></script>
<div class="main4">
© 2018 <a href="https://www.gushiwen.org/">古诗文网</a> | <a href="https://www.gushiwen.org/shiwen/">诗文</a> | <a href="/mingju/">名句</a> | <a href="/authors/">作者</a> | <a href="/guwen/">古籍</a> | <a href="/jiucuo.aspx?u=" target="_blank" rel="nofollow">纠错</a>
</div>
<script type="text/javascript">
window.onload = function () {
setIframeHeight(document.getElementById('external-frame'));
};
</script>
<script defer="defer" src="/js/skinso20180608.js" type="text/javascript"></script>
<script defer="defer" src="/js/jquery-3.2.1.min.js" type="text/javascript"></script>
<script defer="defer" src="/js/jquery.qrcode.min.js" type="text/javascript"></script>
</body>
</html>
3. Automatic captcha recognition :
(1) OCR with tesseract -- must be installed separately
    recognition from the command line
    the recognition rate is poor out of the box; it can be trained
    recognition from code:
        pip install pytesseract
        pip install pillow
    preprocess the image first, then recognize it, to raise the recognition rate
(2) captcha-solving platforms
    e.g. yundama (云打码)
tesseract.py :
import pytesseract
from PIL import Image
from PIL import ImageEnhance

img = Image.open('./code/office.png')
# Convert to grayscale
img = img.convert('L')

# Optional enhancement pipeline (left disabled):
# enhancer = ImageEnhance.Color(img)
# enhancer = enhancer.enhance(0)
# enhancer = ImageEnhance.Brightness(enhancer)
# enhancer = enhancer.enhance(2)
# enhancer = ImageEnhance.Contrast(enhancer)
# enhancer = enhancer.enhance(8)
# enhancer = ImageEnhance.Sharpness(enhancer)
# img = enhancer.enhance(20)

# Binarize: pixels below the threshold become black, the rest white
threshold = 140
table = []
for i in range(256):
    if i < threshold:
        table.append(0)
    else:
        table.append(1)
out = img.point(table, '1')

# out.show()
# Recognize the binarized image (use out, not img, so the binarization is not discarded)
img = out.convert('RGB')

print(pytesseract.image_to_string(img))
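Continuing from the script above, recognition on short captchas often improves if tesseract is told what to expect. Both options below are standard tesseract flags; the whitelist here assumes a purely alphanumeric captcha, so adjust it to the real character set:

# --psm 7 treats the image as a single line of text;
# tessedit_char_whitelist restricts the output alphabet
config = '--psm 7 -c tessedit_char_whitelist=0123456789abcdefghijklmnopqrstuvwxyz'
print(pytesseract.image_to_string(img, config=config))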
4. scrapy :
Scrapy architecture diagram: (image not reproduced here)
What is Scrapy? A powerful, lean Python web-crawling framework, itself written in Python. Under the hood it already handles scheduling, request queues, deduplication, and asynchronous networking (it is built on Twisted), and it exposes hooks so you only need to put your effort into extracting urls and parsing pages.
http://scrapy-chs.readthedocs.io/zh_CN/0.24/intro/tutorial.html
http://www.index.html?username=goudan&password=123
http://www.index.html?password=123&username=goudan
url fingerprint deduplication: the two urls above differ only in query-parameter order but request the same resource, so they should be treated as one request
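A minimal sketch of the fingerprinting idea (Scrapy's real implementation lives in scrapy.utils.request.request_fingerprint and also hashes the method and body): canonicalize the url by sorting its query parameters, then hash the result, so parameter order stops mattering.

import hashlib
from urllib.parse import urlparse, parse_qsl, urlencode

def url_fingerprint(url):
    parts = urlparse(url)
    # Canonicalize: sort the query parameters alphabetically
    query = urlencode(sorted(parse_qsl(parts.query)))
    canonical = '%s://%s%s?%s' % (parts.scheme, parts.netloc, parts.path, query)
    return hashlib.sha1(canonical.encode('utf8')).hexdigest()

seen = set()
for url in ['http://www.index.html?username=goudan&password=123',
            'http://www.index.html?password=123&username=goudan']:
    fp = url_fingerprint(url)
    print(fp, 'duplicate' if fp in seen else 'new')
    seen.add(fp)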

Install: pip install scrapy
(1) Getting to know the framework
    engine \ spider \ scheduler \ downloader \ pipeline
(2) Using the framework
    scrapy startproject xxx
(3) The directory layout
    firstbloodpro           project root
        firstbloodpro       project package
            __pycache__     cache directory
            spiders         spiders directory
                __pycache__     cache directory
                __init__.py     package marker
                lala.py         spider file (*) --- where the code is written
            __init__.py     package marker
            items.py        where the data structures are defined (*)
            middlewares.py  middlewares
            pipelines.py    pipeline file (*)
            settings.py     settings file (*)
        scrapy.cfg          project configuration (rarely touched)

(4) Generating a spider file
    cd firstbloodpro
    scrapy genspider xxx www.xxx.com
    for what the generated parameters mean, see qiubai.py (a sketch of the generated template follows below)
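For reference, the spider file produced by `scrapy genspider qiubai www.xxx.com` looks roughly like this (the exact template varies a little between Scrapy versions): the first argument becomes the spider's name, the second seeds allowed_domains and start_urls.

# -*- coding: utf-8 -*-
import scrapy

class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'                       # used by `scrapy crawl qiubai`
    allowed_domains = ['www.xxx.com']     # requests to other domains are filtered out
    start_urls = ['http://www.xxx.com/']  # the initial requests

    def parse(self, response):
        # called with the response of each url in start_urls
        pass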
(5) Running it
    cd firstbloodpro/firstbloodpro/spiders
    scrapy crawl qiubai
    edit settings.py: stop obeying robots.txt and set a custom UA (see the snippet below)
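Concretely, the two settings.py changes are (these are Scrapy's standard setting names):

# settings.py
ROBOTSTXT_OBEY = False   # a freshly generated project defaults to True
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'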
(6) Getting to know the response object
    response.text    : the content as a string
    response.body    : the content as bytes
    response.url     : the url that was requested
    response.headers : the response headers
    response.status  : the status code
    xpath is already integrated into scrapy, so use it directly (a sketch follows below):
    response.xpath('')
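A minimal sketch of those attributes and the built-in selectors inside a spider's parse method; the xpath expressions here are made-up placeholders, not selectors for any real page:

def parse(self, response):
    print(response.url, response.status)                   # url and status code
    for div in response.xpath('//div[@class="article"]'):  # placeholder xpath
        yield {
            'author': div.xpath('.//h2/text()').extract_first(),
            'content': div.xpath('.//p/text()').extract_first(),
        }
    # yielded dicts are exactly what `scrapy crawl qiubai -o qiubai.json` exports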
(7) Exporting with a single flag
    scrapy crawl qiubai -o qiubai.json
    scrapy crawl qiubai -o qiubai.xml
    scrapy crawl qiubai -o qiubai.csv

Fixing the blank lines that appear in the csv output:
https://blog.csdn.net/qq_38282706/article/details/80279912