filter.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
# -*- coding: utf-8 -*-
  2. import csv
  3. import re
  4. def filter3():
  5. INPUT_DATA = r"./data/agg_filter3.csv"
  6. OUTPUT_TEMP = "./data/agg_filter4.csv"
  7. startPattern = re.compile("######开始######")
  8. keyPattern = re.compile("赚钱")
  9. total = []
  10. sub = None
  11. with open(INPUT_DATA, "r", encoding="GBK") as fr,\
  12. open(OUTPUT_TEMP, "w", encoding="GBK") as fw:
  13. for line in fr.readlines():
  14. tl = startPattern.findall(line)
  15. if len(tl) > 0:
  16. sub = []
  17. sub.append(line)
  18. total.append(sub)
  19. elif line.startswith("\n"):
  20. continue
  21. else:
  22. kl = keyPattern.findall(line)
  23. if len(kl)>0:
  24. sub.append(line)
  25. sortedList = sorted(total, key=lambda x:len(x), reverse=True)
  26. fw.write("统计信息")
  27. fw.write("%s%d\n" % ("总数:", len(sortedList)))
  28. fw.write("%s%d\n" %("大于等于1000:", len([subList for subList in sortedList if len(subList)>=1000])))
  29. fw.write("%s%d\n" %("大于等于500小于1000:", len([subList for subList in sortedList if len(subList)>=500 and len(subList) < 1000])))
  30. fw.write("%s%d\n" %("大于等于100小于500:", len([subList for subList in sortedList if len(subList)>=100 and len(subList) < 500])))
  31. fw.write("%s%d\n" %("大于等于50小于100:", len([subList for subList in sortedList if len(subList)>=50 and len(subList)<100])))
  32. fw.write("%s%d\n" %("大于等于10小于50:", len([subList for subList in sortedList if len(subList)>=10 and len(subList)<50])))
  33. fw.write("%s%d\n" %("大于等于5小于10:", len([subList for subList in sortedList if len(subList)>=5 and len(subList)<10])))
  34. fw.write("%s%d\n" %("大于等于3小于5:", len([subList for subList in sortedList if len(subList)>=3 and len(subList)<5])))
  35. fw.write("%s%d\n" %("等于2:", len([subList for subList in sortedList if len(subList)==2])))
  36. fw.write("%s%d\n" %("等于1:", len([subList for subList in sortedList if len(subList)==1])))
  37. for subList in sortedList:
  38. if len(subList) == 1:
  39. continue
  40. fw.write("\n")
  41. for line in subList:
  42. fw.write(line)
  43. def filter2():
  44. INPUT_DATA = r"./data/agg_filter.csv"
  45. OUTPUT_TEMP = "./data/agg_filter3.csv"
  46. startPattern = re.compile("######开始######")
  47. total = []
  48. sub = None
  49. with open(INPUT_DATA, "r", encoding="GBK") as fr,\
  50. open(OUTPUT_TEMP, "w", encoding="GBK") as fw:
  51. for line in fr.readlines():
  52. tl = startPattern.findall(line)
  53. if len(tl) > 0:
  54. sub = []
  55. sub.append(line)
  56. total.append(sub)
  57. elif line.startswith("\n"):
  58. continue
  59. else:
  60. sub.append(line)
  61. sortedList = sorted(total, key=lambda x:len(x), reverse=True)
  62. fw.write("统计信息")
  63. fw.write("%s%d\n" % ("总数:", len(sortedList)))
  64. fw.write("%s%d\n" %("大于等于1000:", len([subList for subList in sortedList if len(subList)>=1000])))
  65. fw.write("%s%d\n" %("大于等于500小于1000:", len([subList for subList in sortedList if len(subList)>=500 and len(subList) < 1000])))
  66. fw.write("%s%d\n" %("大于等于100小于500:", len([subList for subList in sortedList if len(subList)>=100 and len(subList) < 500])))
  67. fw.write("%s%d\n" %("大于等于50小于100:", len([subList for subList in sortedList if len(subList)>=50 and len(subList)<100])))
  68. fw.write("%s%d\n" %("大于等于10小于50:", len([subList for subList in sortedList if len(subList)>=10 and len(subList)<50])))
  69. fw.write("%s%d\n" %("大于等于5小于10:", len([subList for subList in sortedList if len(subList)>=5 and len(subList)<10])))
  70. fw.write("%s%d\n" %("大于等于3小于5:", len([subList for subList in sortedList if len(subList)>=3 and len(subList)<5])))
  71. fw.write("%s%d\n" %("等于2:", len([subList for subList in sortedList if len(subList)==2])))
  72. fw.write("%s%d\n" %("等于1:", len([subList for subList in sortedList if len(subList)==1])))
  73. for subList in sortedList:
  74. if len(subList) == 1:
  75. continue
  76. fw.write("\n")
  77. for line in subList:
  78. fw.write(line)
  79. def filter1():
  80. # INPUT_DATA = r"E:\Documents\Code\LongTailKeyDataMining\agg.csv"
  81. INPUT_DATA = r"./data/agg_filter.csv"
  82. OUTPUT_TEMP = "./data/agg_filter2.csv"
  83. filterPattern = []
  84. with open("./data/过滤名单.txt", "r", encoding="UTF-8") as f_filter:
  85. filterSet = set();
  86. while True:
  87. lineContent = f_filter.readline().replace("\n","").replace("\r","")
  88. if not lineContent:
  89. break
  90. filterSet.add(lineContent)
  91. for r in filterSet:
  92. filterPattern.append(re.compile(r))
  93. with open(INPUT_DATA, "r", encoding="GBK") as fr,\
  94. open(OUTPUT_TEMP, "w", encoding="GBK") as fw:
  95. for line in fr.readlines():
  96. writeFlag = True
  97. for p in filterPattern:
  98. l = p.findall(line)
  99. if len(l) > 0:
  100. writeFlag = False
  101. break
  102. if writeFlag:
  103. fw.write(line)
  104. if __name__ == '__main__':
  105. filter3()