I had a few thousand local HTML files to process over the past few days, and needed to scrape the images out of them. I had also just learned multithreading, so this script was born.

The code first:

#coding:utf-8

import os
import sys
import requests
import threading
import Queue
import time
import random
from lxml import etree

global_flag = False  # stop flag: flip to True to make every worker exit
path = 'E:\\33\\html\\'  # folder holding the local html files
lock = threading.Lock()  # serializes network access across threads

# pool of public crawler user-agents; one is picked at random per request
user_agent = [
    'Baiduspider+(+http://www.baidu.com/search/spider.htm)',
    'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    'Googlebot/2.1 (+http://www.googlebot.com/bot.html)',
    'Googlebot/2.1 (+http://www.google.com/bot.html)',
    'Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)',
    'Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)',
    'iaskspider/2.0(+http://iask.com/help/help_index.html)',
    'Mozilla/5.0 (compatible; iaskspider/1.0; MSIE 6.0)',
    'Sogou web spider/3.0(+http://www.sogou.com/docs/help/webmasters.htm#07)',
    'Sogou Push Spider/3.0(+http://www.sogou.com/docs/help/webmasters.htm#07)',
    'Mozilla/5.0 (compatible; YodaoBot/1.0; http://www.yodao.com/help/webmaster/spider/; )',
    'msnbot/1.0 (+http://search.msn.com/msnbot.htm)',
]


class worker(threading.Thread):
    def __init__(self, q):
        self.q = q
        threading.Thread.__init__(self)

    def run(self):
        # keep pulling file names until the queue runs dry or the stop flag is set
        while not global_flag:
            try:
                file_name = self.q.get_nowait()
            except Queue.Empty:
                break
            self.do(file_name)
    def do(self, file_name):
        # parse the local html and collect every 400px-wide image url
        with open(path + file_name, 'r') as picfile:
            r = picfile.read()
        selector = etree.HTML(r)
        content = selector.xpath('//img[@width="400"]/@src')
        for u in content:
            time.sleep(1)  # throttle: some hosts reject frequent requests
            try:
                lock.acquire()
                try:
                    # one request at a time across all threads
                    rurl = requests.get(u, headers=ua())
                finally:
                    lock.release()  # release even if the request raises
                if rurl.status_code != 404:
                    # save the image under its original file name
                    with open('E:\\33\\upload\\' + u[u.rfind('/') + 1:], 'wb') as pic:
                        pic.write(rurl.content)
                    print file_name + ': photo downloaded'
                else:
                    with open('E:\\33\\404\\404.txt', 'a') as log:
                        log.write('404 url:' + u + '\n')
            except Exception:
                # record which source file and url failed
                with open('E:\\33\\404\\error.txt', 'a') as log:
                    log.write(file_name + ':' + u + '\n')
def ua():
    # pick a random crawler user-agent so requests look like a public spider
    return {'User-Agent': random.choice(user_agent)}

def main():
    q = Queue.Queue(0)
    threads = []
    # queue up every file name found under path
    for root, dirs, files in os.walk(path):
        for f in files:
            q.put(f)
    # spin up ten workers that drain the queue in parallel
    for x in range(10):
        w = worker(q)
        w.setDaemon(True)
        w.start()
        threads.append(w)
    for w in threads:
        w.join()
    
if __name__ == '__main__':
    main()
    # text myself that the job is done, then power the machine off
    requests.get("http://api.xxx.com/sms?u=xxx&p=xxx&m=phone&c=【关机】你的电脑在 " + str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))) + "关机了")
    os.system('shutdown -s -f -t 0')

The code is a bit messy; I'm still a beginner, after all.
A quick note on why there is a delay, a lock, and a rotating UA header: during testing, some hosts refused frequent requests, so each worker sleeps between downloads and the lock keeps only one thread on the network at a time. Rotating the User-Agent makes the script pass as a public search-engine crawler. There is still room for improvement, such as adding a timeout parameter to the requests. If you have any questions, leave a comment below and I'll answer as best I can.