歡迎來到Linux教程網
Linux教程網
Linux教程網
Linux教程網
您現在的位置: Linux教程網 >> UnixLinux >> Linux編程 >> Linux編程

Python通過代理多線程抓取圖片

Python作為一門功能強大的腳本語言,經常被用來寫爬蟲程序,下面是Python通過代理多線程抓取圖片代碼
Python爬蟲多線程抓取代理服務器參考:  http://www.linuxidc.com/Linux/2013-07/87289.htm
說明:
1. 多線程方式抓取代理服務器,並多線程驗證代理服務器
ps 代理服務器是從http://www.88181.com/ (測試只選擇了8個頁面)抓取

2. 抓取一個網站的圖片地址,多線程隨機取一個代理服務器下載圖片
ps 圖片網站地址:http://www.ivsky.com(測試只選擇了有限的頁面數)

#!/usr/bin/env python
#coding:utf-8
#BLOG:blog.linuxeye.com

import urllib2
import re
import threading
import time
import random

# Shared state, filled in by the functions and worker threads defined below.
rawProxyList = []      # [ip, port, location] lists scraped from the listing pages
checkedProxyList = []  # (ip, port, location, seconds) tuples for proxies that passed the check
imgurl_list = []       # image URLs collected by imgurlList()

# The listing pages hide each port behind a JavaScript expression such as
# "r+q+r+q"; this table maps every letter back to the digit it stands for.
portdicts ={'v':"3",'m':"4",'a':"2",'l':"9",'q':"0",'b':"5",'i':"7",'w':"6",'r':"8",'c':"1"}

# Proxy listing pages to scrape (pages 1 through 8).
targets = [r"http://www.88181.com/proxy%d.html" % i for i in range(1, 9)]

# Matches one table row of a listing page. Captured groups, in order:
#   1. IP address
#   2. obfuscated port expression (letters joined by '+', decoded via portdicts)
#   3. third cell of the row (captured but not stored by ProxyGet)
#   4. location text (the cell between groups 3 and 4 is skipped)
p = re.compile(r'''<tr><td>(.+?)<SCRIPT type=text/javascript>document.write\(":"\+(.+?)\)</SCRIPT></td><td>(.+?)</td><td>.+?</td><td>(.+?)</td></tr>''')

# Worker thread that scrapes one proxy listing page
class ProxyGet(threading.Thread):
    """Thread that downloads one listing page and records the proxies on it.

    Every row matched by the module-level pattern ``p`` is appended to the
    module-level ``rawProxyList`` as ``[ip, port, location]``.
    """

    def __init__(self,target):
        """Remember ``target``, the URL of the listing page to scrape."""
        threading.Thread.__init__(self)
        self.target = target

    def getProxy(self):
        """Fetch ``self.target`` and append each parsed proxy to ``rawProxyList``.

        A row is skipped when its port expression contains a letter missing
        from ``portdicts`` or when its location text cannot be decoded as
        cp936, so one malformed row does not discard the rest of the page.
        Errors raised while fetching the page itself are not caught here.
        """
        print("代理服務器目標網站: " + self.target)
        req = urllib2.urlopen(self.target)
        try:
            result = req.read()
        finally:
            req.close()

        for row in p.findall(result):
            ip = row[0]
            try:
                # The port arrives as letters joined by '+'; translate each
                # letter to its digit and glue the digits together.
                port = ''.join(portdicts[x] for x in row[1].split('+'))
                # Location text is re-encoded from cp936 to UTF-8.
                addr = row[3].decode("cp936").encode("utf-8")
            except (KeyError, UnicodeError):
                continue
            rawProxyList.append([ip, port, addr])

    def run(self):
        """Thread entry point: scrape the page."""
        self.getProxy()

# Worker thread that validates a batch of proxies
class ProxyCheck(threading.Thread):
    """Thread that tests a list of proxies and records the working ones.

    A proxy passes when ``testUrl`` can be fetched through it and the
    response body contains ``testStr``. Passing proxies are appended to the
    module-level ``checkedProxyList`` as ``(ip, port, location, seconds)``.
    """

    def __init__(self,proxyList):
        """Store ``proxyList``, a sequence of ``[ip, port, location]`` entries."""
        threading.Thread.__init__(self)
        self.proxyList = proxyList
        self.timeout = 5  # timeout passed to opener.open()
        self.testUrl = "http://www.baidu.com/"
        # Marker expected somewhere in the genuine test page.
        # NOTE(review): presumably part of the site's footer text - confirm.
        self.testStr = "030173"

    def checkProxy(self):
        """Try every proxy in ``self.proxyList`` and keep those that work.

        Any exception while fetching through a proxy simply marks that proxy
        as unusable; checking continues with the next one.
        """
        cookies = urllib2.HTTPCookieProcessor()
        for proxy in self.proxyList:
            proxyHandler = urllib2.ProxyHandler({"http" : r'http://%s:%s' %(proxy[0],proxy[1])})
            opener = urllib2.build_opener(cookies,proxyHandler)
            opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]
            t1 = time.time()

            try:
                req = opener.open(self.testUrl, timeout=self.timeout)
                try:
                    result = req.read()
                    timeused = time.time() - t1
                finally:
                    req.close()
            except Exception:
                # Deliberately broad: a bad proxy can fail in many ways
                # (refused, timed out, garbage reply) and all mean "skip it".
                continue

            # Membership test instead of `find(...) > 1`, which wrongly
            # rejected a marker located at offset 0 or 1.
            if self.testStr in result:
                checkedProxyList.append((proxy[0],proxy[1],proxy[2],timeused))

    def run(self):
        """Thread entry point: check the assigned proxies."""
        self.checkProxy()

# Collect image URLs from the picture site
def imgurlList(url_home):
    """Append the image URLs found one level below ``url_home`` to ``imgurl_list``.

    The home page is fetched and every link matching the ``<li><a ...>``
    pattern is followed; each ``<img src="...">`` value on those pages is
    appended to the module-level ``imgurl_list``.

    Links and image sources are resolved with ``urlparse.urljoin`` so that
    relative, root-relative and absolute references all yield usable URLs
    (plain string concatenation only worked for hrefs starting with '/').

    Network errors are not caught here and propagate to the caller.
    """
    import urlparse  # function-scope import: Python 2 stdlib, used only here

    url_re = re.compile(r'<li><a href="(.+?)" target="_blank" rel="nofollow">')
    pic_re = re.compile(r'<img src="(.*?\.\w{3,4})"')

    home_page = urllib2.urlopen(url_home)
    try:
        home_html = home_page.read()
    finally:
        home_page.close()

    for url in url_re.findall(home_html):
        page_url = urlparse.urljoin(url_home, url)
        url_page = urllib2.urlopen(page_url)
        try:
            page_html = url_page.read()
        finally:
            url_page.close()
        for img_src in pic_re.findall(page_html):
            imgurl_list.append(urlparse.urljoin(page_url, img_src))

# Worker thread that downloads a batch of images
class getPic(threading.Thread):
    """Thread that downloads images, each through a randomly chosen proxy.

    Proxies are drawn from the module-level ``checkedProxyList``. Each image
    is saved in the current directory under a random numeric file name that
    keeps the text after the last '.' of the URL as its extension.
    """

    def __init__(self,imgurl_list):
        """Store ``imgurl_list``, the image URLs this thread should download."""
        threading.Thread.__init__(self)
        self.imgurl_list = imgurl_list
        self.timeout = 5  # timeout passed to opener.open()

    def downloadimg(self):
        """Download every URL in ``self.imgurl_list``; failures are skipped.

        Returns immediately when no verified proxy is available, since
        ``random.choice`` on an empty list would otherwise raise and kill
        the thread.
        """
        if not checkedProxyList:
            return

        for imgurl in self.imgurl_list:
            pic_suffix = imgurl.split('.')[-1]  # file extension taken from the URL
            pic_name = str(random.randint(0,10000000000))+'.'+pic_suffix
            randomCheckedProxy = random.choice(checkedProxyList)  # pick one proxy at random
            proxyHandler = urllib2.ProxyHandler({"http" : r'http://%s:%s' %(randomCheckedProxy[0],randomCheckedProxy[1])})
            # The opener is used directly below, so it is not installed as
            # the process-wide default (that would race between threads).
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(),proxyHandler)
            opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]
            try:
                data_img = opener.open(imgurl,timeout=self.timeout)
                try:
                    img_bytes = data_img.read()
                finally:
                    data_img.close()
                # Create the file only after the body was read successfully,
                # so a failed download leaves no empty file behind.
                with open(pic_name,'wb') as f:
                    f.write(img_bytes)
            except Exception:
                # Best effort: a dead proxy or a failed write just skips
                # this image.
                continue

    def run(self):
        """Thread entry point: download the assigned images."""
        self.downloadimg()

def _split_evenly(items, parts):
    """Return ``parts`` consecutive slices of ``items``, ceil(len/parts) items each.

    Trailing slices may be shorter or empty when the items run out.
    """
    size = (len(items) + parts - 1) // parts
    return [items[size * n:size * (n + 1)] for n in range(parts)]


def _run_all(threads):
    """Start every thread, then wait until all of them have finished."""
    for worker in threads:
        worker.start()
    for worker in threads:
        worker.join()


def main():
    """Scrape proxies, verify them, download images through them, save the list.

    Steps, in order:
      1. collect image URLs from the picture site;
      2. scrape each listing page in ``targets`` with its own thread;
      3. verify the scraped proxies with 20 threads, one slice each;
      4. download the images with 20 threads, one slice each;
      5. write the verified proxies to ``proxy_list.txt``, fastest first.
    """
    imgurlList('http://www.ivsky.com')

    # One thread per listing page.
    getThreads = [ProxyGet(target) for target in targets]
    _run_all(getThreads)
    print('.'*10+"總共抓取了%s個代理" %len(rawProxyList) +'.'*10)

    # 20 checker threads, each validating one slice of the scraped proxies.
    checkThreads = [ProxyCheck(chunk) for chunk in _split_evenly(rawProxyList, 20)]
    _run_all(checkThreads)
    print('.'*10+"總共有%s個代理通過校驗" %len(checkedProxyList) +'.'*10)

    # 20 downloader threads, each fetching one slice of the image URLs.
    getPicThreads = [getPic(chunk) for chunk in _split_evenly(imgurl_list, 20)]
    _run_all(getPicThreads)
    # NOTE: this reports the number of URLs attempted, not the number of
    # files actually written; failed downloads are skipped silently.
    print('.'*10+"總共有%s個圖片下載" %len(imgurl_list) +'.'*10)

    # Persist the verified proxies, sorted by measured response time.
    with open("proxy_list.txt",'w') as f:
        for proxy in sorted(checkedProxyList, key=lambda entry: entry[3]):
            f.write("%s:%s\t%s\t%s\n"%(proxy[0],proxy[1],proxy[2],proxy[3]))


if __name__ == "__main__":
    main()

測試結果:
# ls
proxy_getpic.py
# python proxy_getpic.py
代理服務器目標網站: http://www.88181.com/proxy1.html
代理服務器目標網站: http://www.88181.com/proxy2.html
代理服務器目標網站: http://www.88181.com/proxy3.html
代理服務器目標網站: http://www.88181.com/proxy4.html
代理服務器目標網站: http://www.88181.com/proxy5.html
代理服務器目標網站: http://www.88181.com/proxy6.html
代理服務器目標網站: http://www.88181.com/proxy7.html
代理服務器目標網站: http://www.88181.com/proxy8.html
..........總共抓取了800個代理..........
..........總共有458個代理通過校驗..........
..........總共有154個圖片下載..........
# cat proxy_list.txt | more
173.213.113.111:3128    United States  0.432188987732
173.213.113.111:8089    United States  0.441318035126
173.213.113.111:7808    United States  0.444597005844
110.4.24.170:80 香港 香港移動通訊有限公司      0.489440202713
211.142.236.135:8080    湖南省株洲市 移動      0.490673780441
211.142.236.135:8081    湖南省株洲市 移動      0.518096923828
211.142.236.135:8000    湖南省株洲市 移動      0.51860499382
211.142.236.135:8082    湖南省株洲市 移動      0.520448207855
# ls
1001117689.jpg  3097883176.jpg  5234319709.jpg  7012274766.jpg  8504924248.jpg
1076458640.jpg  3144369522.jpg  5387877704.jpg  7106183143.jpg  867723868.jpg
1198548712.jpg  3161307031.jpg  5572092752.jpg  7361254661.jpg  8746315373.jpg
165738192.jpg  3228008315.jpg  5575388077.jpg  7389537793.jpg  8848973192.jpg
1704512138.jpg  3306931164.jpg  5610740708.jpg  7407358698.jpg  8973834958.jpg
1742167711.jpg  3320152673.jpg  5717429022.jpg  7561176207.jpg  8976862152.jpg
...............

Copyright © Linux教程網 All Rights Reserved