split.py 2.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758
# -*- coding: utf-8 -*-
  2. import csv
  3. import re
  4. def split():
  5. INPUT_DATA = r"./data/agg_filter.csv"
  6. OUTPUT_TEMP = "./data/split/agg_split_%d.txt"
  7. OUTPUT_TEMP2 = "./data/split/agg_split_%d_%d.txt"
  8. startPattern = re.compile("######开始######")
  9. con_l = []
  10. sub = None
  11. with open(INPUT_DATA, "r", encoding="GBK") as fr:
  12. for line in fr.readlines():
  13. tl = startPattern.findall(line)
  14. if len(tl) > 0:
  15. sub = []
  16. sub.append(line)
  17. con_l.append(sub)
  18. elif line.startswith("\n"):
  19. continue
  20. else:
  21. sub.append(line)
  22. # step = 71500
  23. # for i, v in enumerate(range(0, len(con_l), step)):
  24. # with open(OUTPUT_TEMP % (i+1), "w", encoding="GBK") as fw:
  25. # for ele in con_l[v:v+step]:
  26. # if len(ele) == 1:
  27. # continue
  28. # fw.write("\n")
  29. # for content in ele:
  30. # fw.write(content)
  31. filter_l = [
  32. (1000, 1000, [subList for subList in con_l if len(subList)>=1000]),
  33. (500, 1000, [subList for subList in con_l if len(subList)>=500 and len(subList) < 1000]),
  34. (100,500,[subList for subList in con_l if len(subList)>=100 and len(subList) < 500]),
  35. (50,100,[subList for subList in con_l if len(subList)>=50 and len(subList)<100]),
  36. (10,50,[subList for subList in con_l if len(subList)>=10 and len(subList)<50]),
  37. (5,10,[subList for subList in con_l if len(subList)>=5 and len(subList)<10]),
  38. (3,5,[subList for subList in con_l if len(subList)>=3 and len(subList)<5]),
  39. (2,2,[subList for subList in con_l if len(subList)==2])
  40. # (1,1,[subList for subList in con_l if len(subList)==1])
  41. ]
  42. for start, end, sublist in filter_l:
  43. with open(OUTPUT_TEMP2 % (start, end), "w", encoding="GBK") as fw:
  44. for ele in sublist:
  45. fw.write("\n")
  46. for content in ele:
  47. fw.write(content)
  48. if __name__ == '__main__':
  49. split()