Python3 多線程下載代碼,貌似原版源自Axel這個多線程下載工具。
'''
Created on 2014-10-24
@author: Maple
'''
import sys
import os
import time
import getopt
import urllib.request
import urllib.parse
from threading import Thread
#===============================================================================
# def download(url, target=os.getcwd(), blocks=6, proxies=local_proxies)
# target:  directory the downloaded file is saved in; defaults to the current directory
# blocks:  number of download threads
# proxies: proxy address
#===============================================================================
local_proxies = {}  # proxy address; used as the default `proxies` argument of download()
class Maple(Thread):
    """Worker thread that downloads one byte range of a URL into a part file.

    The part file is opened in append mode, so bytes already present in it
    are treated as the beginning of this thread's range and are not requested
    again (resume support).

    Attributes:
        url: address of the resource to download.
        filename: path of the part file this thread appends to.
        ranges: ``(first, last)`` inclusive byte offsets to fetch, or a falsy
            value when the total size is unknown and everything after the
            already-downloaded bytes should be fetched.
        proxies: proxy setting handed to ``GetUrlOpener``.
        downloaded: number of bytes of this part available locally; updated
            while the thread runs so another thread can read progress.
    """

    version = "Mozilla/5.0"  # User-Agent value sent with every request

    def __init__(self, threadname, url, filename, ranges=0, proxies=None):
        """Store the download parameters; nothing is fetched until start()."""
        Thread.__init__(self, name=threadname)
        self.url = url
        # A fresh dict per instance instead of a shared mutable default.
        self.proxies = {} if proxies is None else proxies
        self.filename = filename
        self.ranges = ranges
        self.downloaded = 0

    def run(self):
        """Fetch the missing bytes of this part and append them to the part file."""
        try:
            # Bytes already on disk from an earlier, interrupted run.
            self.downloaded = os.path.getsize(self.filename)
        except OSError:
            self.downloaded = 0

        if self.ranges:
            first, last = self.ranges[0], self.ranges[1]
            # Continue right after the bytes that are already on disk.
            self.startpoint = first + self.downloaded
            # `last` is inclusive: the part is complete only once the start
            # point has moved past it, and it then holds last - first + 1 bytes.
            if self.startpoint > last:
                self.downloaded = last - first + 1
                print('Part %s has been downloaded over.' % self.filename)
                return
            byte_range = 'bytes={}-{}'.format(self.startpoint, last)
            print('task %s will download from %d to %d'
                  % (self.name, self.startpoint + 1, last + 1))
        else:
            # Size unknown: request everything after what is already on disk.
            self.startpoint = self.downloaded
            byte_range = 'bytes={}-'.format(self.startpoint)

        # Build the opener only when a request is really needed.
        opener = GetUrlOpener(self.proxies)
        opener.addheaders = [('Range', byte_range), ('User-agent', self.version)]

        self.fetchsize = 16384  # bytes read per call
        self.urlhandle = opener.open(self.url)
        try:
            with open(self.filename, 'ab') as filehandle:
                data = self.urlhandle.read(self.fetchsize)
                while data:
                    filehandle.write(data)
                    # Flush every chunk so the bytes reach the file even if
                    # this daemon thread is killed when the process exits;
                    # the resume logic relies on the on-disk size.
                    filehandle.flush()
                    self.downloaded += len(data)
                    data = self.urlhandle.read(self.fetchsize)
        finally:
            self.urlhandle.close()
def Sec2Time(second):
    """Format a duration given in seconds as a human-readable string.

    Only the units from the largest non-zero one downwards are shown:
    ``'5.25S.'``, ``'01M:5.50S'``, ``'01H:01M:1.00S'`` or
    ``'001D:01H:01M:1.50S'``.

    Args:
        second: duration in seconds (int or float).

    Returns:
        The formatted duration.
    """
    whole_minutes, second = divmod(second, 60)
    # The unit counts must be ints: with a float, '{:02}' renders '1.0'
    # rather than the zero-padded '01'.
    whole_hours, minute = divmod(int(whole_minutes), 60)
    day, hour = divmod(whole_hours, 24)

    if day:
        return '{:03}D:{:02}H:{:02}M:{:0.2f}S'.format(day, hour, minute, second)
    if hour:
        return '{:02}H:{:02}M:{:0.2f}S'.format(hour, minute, second)
    if minute:
        return '{:02}M:{:0.2f}S'.format(minute, second)
    return '{:0.2f}S.'.format(second)
def GetUrlOpener(proxies=None):
    """Build a urllib opener for the given proxy setting.

    Args:
        proxies: one of
            * a falsy value (``None``, ``''``, ``{}``): no proxy handler is
              added explicitly;
            * a string ``[user/passwd@][scheme://]host[:port]``, for example
              ``'user/passwd@http://127.0.0.1:8087'``; the scheme defaults
              to ``http``;
            * a dict that is passed to ``urllib.request.ProxyHandler`` as is.

    Returns:
        A ``urllib.request.OpenerDirector``. An unsupported ``proxies`` value
        is reported on stdout and yields a plain opener.

    Note:
        For the string form the handler is registered under the proxy's own
        scheme only, so a ``http://`` proxy is applied to ``http`` URLs.
    """
    if not proxies:
        return urllib.request.build_opener()
    if isinstance(proxies, dict):
        return urllib.request.build_opener(urllib.request.ProxyHandler(proxies))
    if not isinstance(proxies, str):
        print('Unsupported proxy setting: {!r}'.format(proxies))
        return urllib.request.build_opener()

    # Split at the LAST '@' so a password may itself contain '@'.
    auth, _, addr = proxies.rpartition('@')
    ptype, has_scheme, phost = addr.partition('://')
    if not has_scheme:
        ptype, phost = 'http', addr
    proxy_handler = urllib.request.ProxyHandler({ptype: ptype + '://' + phost})

    # Split at the FIRST '/' so a password may itself contain '/'.
    user, has_passwd, passwd = auth.partition('/')
    if not has_passwd:
        return urllib.request.build_opener(proxy_handler)

    # Register the credentials for any realm: the realm announced by the
    # proxy is not known in advance, and a fixed realm name would never
    # match the proxy's challenge.
    password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    password_mgr.add_password(None, phost, user, passwd)
    proxy_auth_handler = urllib.request.ProxyBasicAuthHandler(password_mgr)
    return urllib.request.build_opener(proxy_handler, proxy_auth_handler)
def GetUrlFileInfo(url, proxies=None):
    """Request *url* and describe the file behind it from the response headers.

    The file name is taken from the last segment of the URL path and is
    replaced by the name announced in the response headers when there is one.

    Args:
        url: address of the resource (already percent-encoded).
        proxies: proxy setting handed to ``GetUrlOpener``.

    Returns:
        ``[(name, ext), (maintype, subtype), length]`` where ``name``/``ext``
        are the file name split by ``os.path.splitext`` (``('Unknown', '')``
        when no name could be determined), ``maintype``/``subtype`` come from
        Content-Type (``application/octet-stream`` when the header is
        missing), and ``length`` is Content-Length as an int, or ``-1`` when
        it is missing or not a number.

    Raises:
        Whatever ``opener.open`` raises for an unreachable or invalid URL.
    """
    url_path = urllib.parse.urlsplit(url).path
    # Percent-decoding recovers non-ASCII names embedded in the URL.
    filename = urllib.parse.unquote(url_path).split('/')[-1]

    opener = GetUrlOpener(proxies)
    response = opener.open(url)
    try:
        headers = response.info()
    finally:
        # Only the headers are needed; do not keep the connection open.
        response.close()

    # Message.get_filename() understands quoted values, extra parameters
    # after the name, and RFC 2231 encoded names.
    served_name = headers.get_filename()
    if served_name:
        served_name = served_name.strip('\'"')
        try:
            # http.client decodes header bytes as ISO-8859-1; when the server
            # actually sent UTF-8 this round trip repairs the garbled name.
            served_name = served_name.encode('iso-8859-1').decode('utf-8')
        except UnicodeError:
            pass
        filename = urllib.parse.unquote(served_name)

    # The name comes from the server: keep only its last path component so it
    # cannot point outside the download directory.
    filename = filename.replace('\\', '/').split('/')[-1]
    if filename in ('', '.', '..'):
        name, ext = 'Unknown', ''
    else:
        name, ext = os.path.splitext(filename)

    # Without a usable length only a single-threaded download is possible.
    try:
        length = int(headers.get('Content-Length', -1))
    except (TypeError, ValueError):
        length = -1

    if headers.get('Content-Type'):
        maintype = headers.get_content_maintype()
        subtype = headers.get_content_subtype()
    else:
        maintype, subtype = 'application', 'octet-stream'

    return [(name, ext), (maintype, subtype), length]
def SpliteBlocks(totalsize, blocknumber):
    """Split ``totalsize`` bytes into contiguous inclusive byte ranges.

    Args:
        totalsize: size of the file in bytes.
        blocknumber: requested number of ranges (one per download thread).

    Returns:
        A list of ``(first, last)`` tuples with inclusive offsets that cover
        ``0 .. totalsize - 1`` without gaps or overlap. All ranges have equal
        length except the last one, which also takes the remainder. The list
        has ``blocknumber`` entries, but never more than ``totalsize`` and
        never fewer than one; it is empty when ``totalsize`` is not positive.
    """
    if totalsize <= 0:
        return []
    # Every range must hold at least one byte, and at least one range is
    # needed; this also avoids dividing by zero.
    blocknumber = max(1, min(blocknumber, totalsize))
    blocksize = totalsize // blocknumber
    ranges = [(i * blocksize, (i + 1) * blocksize - 1)
              for i in range(blocknumber - 1)]
    # The last range absorbs whatever the integer division left over.
    ranges.append((blocksize * (blocknumber - 1), totalsize - 1))
    return ranges
def islive(tasks):
    """Return True while at least one of the given threads is still running.

    Args:
        tasks: iterable of ``threading.Thread`` objects.
    """
    # Thread.isAlive() was removed in Python 3.9; is_alive() is the
    # supported spelling on every Python 3 version.
    return any(task.is_alive() for task in tasks)
def _next_free_path(path):
    """Return *path*, or its first 'name(N).ext' variant that does not exist yet."""
    if not os.path.exists(path):
        return path
    stem, ext = os.path.splitext(path)
    num = 1
    while os.path.exists('{}({}){}'.format(stem, num, ext)):
        num += 1
    return '{}({}){}'.format(stem, num, ext)


def _merge_parts(parts, output):
    """Concatenate the part files into *output* in order, then delete the parts."""
    import shutil

    with open(output, 'wb') as merged:
        for part in parts:
            with open(part, 'rb') as source:
                # Streamed copy: a part is never held in memory as a whole.
                shutil.copyfileobj(source, merged)
    # Delete the parts only after every one of them has been copied.
    for part in parts:
        os.remove(part)


def _remove_quietly(paths):
    """Delete the given files, ignoring those that cannot be removed."""
    for path in paths:
        try:
            os.remove(path)
        except OSError:
            pass


def download(url, target=os.getcwd(), blocks=6, proxies=local_proxies):
    """Download *url* into directory *target* using up to *blocks* threads.

    The resource is first queried for its name, type and size. When the size
    is known the file is split into byte ranges that are fetched in parallel
    into part files named ``<name>_tmpfile_<i>`` (relative to the current
    working directory) and merged afterwards; otherwise a single thread
    fetches everything. Progress and results are printed to stdout. An
    existing file in *target* is never overwritten: ``(1)``, ``(2)``, ... is
    appended to the name instead.

    Args:
        url: address of the resource; non-ASCII characters are percent-encoded.
        target: directory the file is saved in; created when missing.
        blocks: requested number of download threads.
        proxies: proxy setting handed on to the helper functions.

    Note:
        When the resource information cannot be retrieved, the process is
        terminated with ``sys.exit()``.
    """
    import shutil

    print('Retrieving resource information...')
    # Percent-encode everything that is not valid in a URL.
    url = urllib.parse.quote(url, safe='/%&@=+?$;,:')
    try:
        infos = GetUrlFileInfo(url, proxies)
    except Exception as ex:
        # Boundary of the network layer: any failure here means the same
        # thing to the user, so report it and stop.
        print(ex)
        print('Failed to retrieve resource information!')
        sys.exit()

    os.makedirs(target, exist_ok=True)

    (name, ext), content_type, size = infos
    basename = name + ext
    output = os.path.join(target, basename)
    starttime = time.time()
    print('Infomation:')
    print('FileName:{0} FileType:{1} FileLength:{2}'.format(
        basename, '/'.join(content_type), size if size > 0 else 'Unknown'))

    if size > 0:
        print('Starting multithread download...')
        # Never start more threads than there are bytes, and at least one.
        blocks = max(1, min(blocks, size))
        ranges = SpliteBlocks(size, blocks)
        blocks = len(ranges)
    else:
        # Unknown size: the file cannot be split, use one thread and no range.
        print('Starting single thread download...')
        ranges = ()
        blocks = 1

    part_files = ['{}_tmpfile_{}'.format(name, i) for i in range(blocks)]
    tasks = []
    for i in range(blocks):
        task = Maple('{}_thread_{}'.format(name, i), url, part_files[i],
                     ranges[i] if ranges else ranges, proxies)
        # Daemon threads do not keep the process alive after the main
        # thread ends (Thread.setDaemon() is deprecated).
        task.daemon = True
        task.start()
        tasks.append(task)
    time.sleep(1)

    while islive(tasks):
        downloaded = sum(task.downloaded for task in tasks)
        if size > 0:
            process = downloaded / float(size) * 100
            show = '\rFilesize:%d Downloaded:%d Completed:%.2f%%' % (size, downloaded, process)
        else:
            show = '\rDownloaded:%d ' % downloaded
        sys.stdout.write(show)
        sys.stdout.flush()
        time.sleep(0.2)

    consuming = Sec2Time(time.time() - starttime)
    # Take the final count after all threads have stopped; the last sample
    # from the progress loop can be stale.
    downloaded = sum(task.downloaded for task in tasks)

    if size > 0:
        # A part whose thread failed early may not exist at all; count it as
        # empty so the size check below reports the failure.
        downloadsize = sum(os.path.getsize(part) for part in part_files
                           if os.path.exists(part))
        succeeded = downloadsize == size
        if succeeded:
            show = '\rFilesize:%d Downloaded:%d Completed:%.2f%%\n' % (size, downloadsize, 100)
        else:
            show = '\nSize is not mathed!\n'
    else:
        succeeded = os.path.exists(part_files[0])
        show = '\nTotal Size: %d\n' % downloaded
    sys.stdout.write(show)
    sys.stdout.flush()

    if succeeded:
        print('Integrating files...')
        output = _next_free_path(output)
        try:
            if len(part_files) == 1:
                # shutil.move also works when the part file and the target
                # directory are on different file systems.
                shutil.move(part_files[0], output)
            else:
                _merge_parts(part_files, output)
        except OSError as ex:
            print(ex)
            # Do not leave a truncated target file behind.
            _remove_quietly([output])
            print('Failed to generate target file!')
        else:
            print('Download Complete!')
    else:
        _remove_quietly(part_files)
        print('Download Failed!')
    print('Consuming: {}\n'.format(consuming))
def main(argv):
    """Parse the command-line arguments and start the download.

    Options:
        -h, --help            show the usage text and exit
        -u, --url URL         address of the file to download (required)
        -t, --target DIR      directory to save the file in (default: current
                              directory); -f is accepted as an alias
        -n, --num N           number of download threads (default: 2)
        -p, --proxy PROXY     proxy as [user/passwd@][scheme://]host[:port]

    Args:
        argv: the argument list without the program name.

    Exits with status 2 on invalid arguments.
    """
    usage = ('Usage: -u URL [-t DIR] [-n THREADS] [-p PROXY]\n'
             '  -h, --help      show this help and exit\n'
             '  -u, --url       address of the file to download (required)\n'
             '  -t, --target    directory to save the file in (default: current directory)\n'
             '  -n, --num       number of download threads (default: 2)\n'
             '  -p, --proxy     proxy, e.g. user/passwd@http://127.0.0.1:8087')
    try:
        # 't:' must be declared for -t to be accepted; 'f:' is kept because
        # earlier versions declared it.
        options, args = getopt.getopt(
            argv, 'hu:t:f:n:p:', ['help', 'url=', 'target=', 'num=', 'proxy='])
    except getopt.GetoptError as ex:
        print(ex)
        print(usage)
        sys.exit(2)

    num = 2
    url, proxies = '', ''
    target = os.getcwd()
    for name, value in options:
        if name in ('-h', '--help'):
            print(usage)
            sys.exit()
        elif name in ('-u', '--url'):
            url = value
        elif name in ('-t', '-f', '--target'):
            target = value
        elif name in ('-n', '--num'):
            try:
                num = int(value)
            except ValueError:
                print('Invalid thread number: {}'.format(value))
                sys.exit(2)
        elif name in ('-p', '--proxy'):
            proxies = value

    if not url:
        print('No URL given.')
        print(usage)
        sys.exit(2)
    if num < 1:
        print('Thread number must be at least 1.')
        sys.exit(2)

    download(url, target, num, proxies)
# Run the command-line entry point only when this file is executed as a
# script; the program name (argv[0]) is not passed on.
if __name__ == '__main__':
    main(sys.argv[1:])
這段代碼在異常處理方面寫得有些亂,沒怎麼關心異常處理,需要時再改吧
另外,多線程下載時如果使用了代理,可能會導致下載到的文件與服務器提供的文件大小不符,從而下載失敗。我使用的是GoAgent,代理服務器會自動對目標文件進行多線程下載,無視程序指定的下載字節范圍:第一個線程就會下載到完整的文件,其他線程會下載冗余內容。沒有找到規范的處理辦法,變通的處理辦法有2種:
1、將使用了代理的多線程下載強制指定為單線程下載
2、不進行文件大小的校驗,將多線程下載的0號臨時文件重命名為目標文件,其他臨時文件刪除。
兩種方法實現起來都很簡單,但是會破壞代碼的整體邏輯,因此沒有加入代碼中。運行截圖:
下面關於Python的文章您也可能喜歡,不妨看看:
Linux下Python的安裝以及注意事項 http://www.linuxidc.com/Linux/2015-11/124861.htm
Ubuntu 14.04 下安裝使用Python rq模塊 http://www.linuxidc.com/Linux/2015-08/122441.htm
無需操作系統直接運行 Python 代碼 http://www.linuxidc.com/Linux/2015-05/117357.htm
CentOS上源碼安裝Python3.4 http://www.linuxidc.com/Linux/2015-01/111870.htm
《Python核心編程 第二版》.(Wesley J. Chun ).[高清PDF中文版] http://www.linuxidc.com/Linux/2013-06/85425.htm
《Python開發技術詳解》.( 周偉,宗傑).[高清PDF掃描版+隨書視頻+代碼] http://www.linuxidc.com/Linux/2013-11/92693.htm
Python腳本獲取Linux系統信息 http://www.linuxidc.com/Linux/2013-08/88531.htm
在Ubuntu下用Python搭建桌面算法交易研究環境 http://www.linuxidc.com/Linux/2013-11/92534.htm
Python 語言的發展簡史 http://www.linuxidc.com/Linux/2014-09/107206.htm
Python 的詳細介紹:請點這裡
Python 的下載地址:請點這裡