需求:在進行Hadoop測試時,需要造大量數據,例如某個表存在56列,但實際程序邏輯只使用到其中某幾列,因此我們造的數據也只需要提供這幾列
構造幾列數據,轉化為對應數據表格式
涉及模塊:os,getopt,sys
輸入:源格式,文本文件
輸出:目標格式,文本文件
- #!/usr/bin/python
- # -*- coding: utf-8 -*-
- #dataformat.py
- #this script converts data from your source format to the destination data format
- #2011-08-05 created version0.1
- #2011-10-29 add row-row mapping ,default row value .rebuild all functions. version0.2
- #next:add data auto generate by re expression
-
- import os,getopt,sys
-
def read_file(path):
    """Read a text file and return all of its lines.

    Args:
        path: Path of the file to read.

    Returns:
        A list with one string per line; line terminators are kept.
    """
    # The context manager closes the handle even if reading fails.
    with open(path, "r") as handle:
        return handle.readlines()
-
def one_line_proc(parts, total, ft_map, outsp, empty_fill):
    """Convert the fields of one input line into one output line.

    Args:
        parts: Fields of the input line, already split on the input
            separator.
        total: Number of fields in the output line; output columns are
            numbered from 1 to ``total``.
        ft_map: Maps an output column number to a string. A value that
            starts with ``"d"`` is a literal default (the text after the
            ``"d"`` is emitted); any other value is the 1-based number of
            the input field to copy.
        outsp: Separator placed between output fields.
        empty_fill: Text emitted for output columns that have no mapping,
            and for mappings whose input field does not exist in ``parts``.

    Returns:
        The output line without a trailing newline.
    """
    fields = []
    for column in range(1, total + 1):
        source = ft_map.get(column)
        if source is None:
            fields.append(empty_fill)
        elif source.startswith("d"):
            fields.append(source[1:])
        else:
            index = int(source) - 1
            # Guard both ends: a too-large index would raise IndexError and
            # index -1 (source column 0) would silently pick the last field.
            if 0 <= index < len(parts):
                fields.append(parts[index])
            else:
                fields.append(empty_fill)
    return outsp.join(fields)
-
def process(inpath, total, to, outpath, insp="\t", outsp="\t", empty_fill=""):
    """Read ``inpath``, convert every line and write the result to ``outpath``.

    Each entry of ``to`` describes one output column, in one of three forms:

    * ``"n=value"`` - output column ``n`` always holds the literal ``value``.
    * ``"n:m"``     - output column ``n`` is copied from input column ``m``.
    * ``"n"``       - output column ``n`` is copied from the lowest-numbered
      input column that no other entry has claimed yet.

    An entry containing an escaped separator (backslash followed by ``=`` or
    ``:``) is not treated as that form. When several entries name the same
    output column, the last one wins.

    Args:
        inpath: Path of the input text file.
        total: Number of fields in every output line.
        to: Iterable of column specifications as described above.
        outpath: Path of the output text file (overwritten).
        insp: Field separator of the input file; a tab by default.
        outsp: Field separator of the output file; a tab by default.
        empty_fill: Text written into output columns that have no mapping.

    Raises:
        ValueError: If a column number in ``to`` is not an integer.
    """
    from itertools import count

    def kind_of(spec):
        # The default form is checked first so that a default value which
        # happens to contain ':' is never mistaken for a column mapping.
        if r"\=" not in spec and len(spec.split("=")) == 2:
            return "default"
        if r"\:" not in spec and len(spec.split(":")) == 2:
            return "explicit"
        return "auto"

    specs = [str(spec) for spec in to]
    kinds = [kind_of(spec) for spec in specs]

    # Input columns claimed by explicit "n:m" entries; auto entries must
    # not reuse them.
    used_sources = set(
        int(spec.split(":")[1])
        for spec, kind in zip(specs, kinds)
        if kind == "explicit"
    )
    # Number of entries that consume an input column.
    in_count = sum(1 for kind in kinds if kind != "default")

    ft_map = {}
    for spec, kind in zip(specs, kinds):
        if kind == "default":
            dest, value = spec.split("=")
            ft_map[int(dest)] = "d" + value
        elif kind == "explicit":
            dest, source = spec.split(":")
            ft_map[int(dest)] = source
        else:
            # Lowest unclaimed input column, with no upper bound.
            source = next(i for i in count(1) if i not in used_sources)
            ft_map[int(spec)] = str(source)
            used_sources.add(source)

    # A line is usable only if it has one field per consuming entry and
    # also reaches the highest input column that is actually referenced.
    referenced = [
        int(value) for value in ft_map.values() if not value.startswith("d")
    ]
    min_fields = max([in_count] + referenced)

    result = []
    for line in read_file(inpath):
        parts = line.strip("\n").split(insp)
        if len(parts) >= min_fields:
            outline = one_line_proc(parts, total, ft_map, outsp, empty_fill)
            result.append(outline + "\n")

    # The context manager closes the output even if writing fails.
    with open(outpath, "w") as out:
        out.writelines(result)
-
def help_msg():
    """Print the command-line usage text and exit with status 0.

    The text lists every option accepted by the option parser, including
    the ``-f`` fill value and the three forms of a ``-a`` entry.
    """
    usage = (
        "功能:原數據文件轉為目標數據格式",
        "選項:",
        "\t -i inputfilepath [必輸,原文件路徑]",
        "\t -t n [必輸,n為數字,目標數據總的域個數]",
        "\t -a '1,3=abc,4:2' [必輸,域編號字符串,逗號分隔。"
        "n 用下一個未使用的原數據字段填充,n=值 用固定值填充,"
        "n:m 用原數據第m個字段填充,未指定的域用 -f 的值填充]",
        "\t -f 'FILL' [可選,未指定域的填充值,默認為空字符串 ]",
        "\t -o outputfilepath [可選,默認為 inputfilepath.dist ]",
        "\t -F 'FS' [可選,原文件域分隔符,默認為\\t ]",
        "\t -P 'OFS' [可選,輸出文件的域分隔符,默認為\\t ]",
    )
    for line in usage:
        print(line)
    sys.exit(0)
-
def main():
    """Parse the command line, validate the options and run the conversion.

    Options: ``-i`` input path, ``-t`` total number of output fields and
    ``-a`` column specification are required; ``-o`` output path, ``-F``
    input separator, ``-P`` output separator and ``-f`` fill value are
    optional; ``-h`` / ``--help`` prints the usage text.

    Exits with status 1 after printing a message when the options are
    malformed, a required option is missing, ``-t`` is not a number or the
    input file does not exist.
    """
    def unescape(text):
        # Turn backslash escapes typed on the command line (such as a
        # backslash followed by "t") into the real character.
        try:
            return text.decode("string_escape")  # Python 2 str
        except AttributeError:
            # Python 3 str has no decode(); go through bytes instead.
            raw = text.encode("latin-1", "backslashreplace")
            return raw.decode("unicode_escape")

    prog = sys.argv[0]
    try:
        opts, _ = getopt.getopt(sys.argv[1:], "F:P:t:a:i:o:f:h", ["help"])
    except getopt.GetoptError:
        print(prog + " : params are not defined well!")
        # Stop here so no misleading "missing option" message follows.
        sys.exit(1)

    inpath = outpath = total = to = None
    insp = outsp = "\t"  # same defaults as process()
    empty_fill = ""
    for op, value in opts:
        if op in ("-h", "--help"):
            help_msg()
        elif op == "-i":
            inpath = value
        elif op == "-o":
            outpath = value
        elif op == "-t":
            try:
                total = int(value)
            except ValueError:
                print(prog + " : -t param must be a number!")
                sys.exit(1)
        elif op == "-a":
            to = value.split(",")
        elif op == "-F":
            insp = unescape(value)
        elif op == "-P":
            outsp = unescape(value)
        elif op == "-f":
            empty_fill = value

    if len(opts) < 3:
        print(prog + " : the amount of params must great equal than 3")
        sys.exit(1)

    if inpath is None:
        print(prog + " : -i param is needed,input file path must define!")
        sys.exit(1)

    if total is None:
        print(prog + " : -t param is needed,the fields of result file must define!")
        sys.exit(1)

    if to is None:
        print(prog + " : -a param is needed,must assign the field to put !")
        sys.exit(1)

    if not os.path.exists(inpath):
        print(prog + " file : %s is not exists" % inpath)
        sys.exit(1)

    if outpath is None:
        outpath = inpath + ".dist"

    process(inpath, total, to, outpath, insp, outsp, empty_fill=empty_fill)
-
# Run the command-line entry point only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()