| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758 |
# -*- coding: utf-8 -*-
- import csv
- import re
def split(
    input_path: str = r"./data/agg_filter.csv",
    output_template: str = "./data/split/agg_split_%d_%d.txt",
    encoding: str = "GBK",
) -> None:
    """Split the aggregated input file into one output file per record-size bucket.

    The input is read as text. Every line containing the start marker
    ``######开始######`` opens a new record; the non-blank lines that follow
    belong to that record until the next marker line. Records are then
    bucketed by their number of lines (marker line included) and each bucket
    is written to ``output_template % (start, end)``.

    Buckets, as ``(start, end)`` labels and the sizes they hold:
    ``(1000, 1000)``: >= 1000, ``(500, 1000)``: 500-999,
    ``(100, 500)``: 100-499, ``(50, 100)``: 50-99, ``(10, 50)``: 10-49,
    ``(5, 10)``: 5-9, ``(3, 5)``: 3-4, ``(2, 2)``: exactly 2.
    Records consisting of the marker line only are not written anywhere.

    Args:
        input_path: Path of the file to split.
        output_template: ``%``-format string taking the two integer bucket
            labels and producing the output path. The target directory must
            already exist.
        encoding: Text encoding used for both reading and writing.

    Raises:
        OSError: If the input cannot be opened or an output cannot be created.
        UnicodeError: If the data cannot be decoded/encoded with ``encoding``.
    """
    start_marker = "######开始######"
    # (label_start, label_end, min_size, max_size_exclusive); None = unbounded.
    buckets = (
        (1000, 1000, 1000, None),
        (500, 1000, 500, 1000),
        (100, 500, 100, 500),
        (50, 100, 50, 100),
        (10, 50, 10, 50),
        (5, 10, 5, 10),
        (3, 5, 3, 5),
        (2, 2, 2, 3),
    )

    records = []
    current = None
    with open(input_path, "r", encoding=encoding) as fr:
        # Stream the file instead of materializing it with readlines().
        for line in fr:
            if start_marker in line:
                current = [line]
                records.append(current)
            elif line == "\n":
                # Empty lines are separators only and never stored.
                continue
            elif current is not None:
                current.append(line)
            # Otherwise the line precedes the first marker and belongs to no
            # record; it is skipped (previously this raised AttributeError).

    # Assign each record to its bucket in a single pass, keeping input order.
    grouped = [[] for _ in buckets]
    for record in records:
        size = len(record)
        for index, (_, _, low, high) in enumerate(buckets):
            if size >= low and (high is None or size < high):
                grouped[index].append(record)
                break

    # Every bucket file is created, even when the bucket is empty.
    for (start, end, _, _), members in zip(buckets, grouped):
        with open(output_template % (start, end), "w", encoding=encoding) as fw:
            for record in members:
                # A blank line precedes each record in the output.
                fw.write("\n")
                fw.writelines(record)
-
-
-
# Script entry point: perform the split only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    split()
|