| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135 |
# -*- coding: utf-8 -*-
- import csv
- import re
def filter3():
    """Reduce each group to its keyword lines and write a size report.

    Reads ``./data/agg_filter3.csv`` (GBK). A line containing
    ``######开始######`` opens a new group and is stored as that group's
    first line. Lines that start with a newline character are skipped; of
    the remaining lines only those containing ``赚钱`` are appended to the
    current group. Lines that appear before the first marker belong to no
    group and are ignored.

    Writes ``./data/agg_filter4.csv`` (GBK): a statistics header counting
    groups per size bucket, followed by every group that has more than its
    marker line, largest group first, each preceded by an empty line.
    Group sizes include the marker line itself.
    """
    INPUT_DATA = r"./data/agg_filter3.csv"
    OUTPUT_TEMP = "./data/agg_filter4.csv"
    startPattern = re.compile("######开始######")
    keyPattern = re.compile("赚钱")
    # (label, inclusive lower bound, exclusive upper bound); None = no upper bound.
    buckets = [
        ("大于等于1000:", 1000, None),
        ("大于等于500小于1000:", 500, 1000),
        ("大于等于100小于500:", 100, 500),
        ("大于等于50小于100:", 50, 100),
        ("大于等于10小于50:", 10, 50),
        ("大于等于5小于10:", 5, 10),
        ("大于等于3小于5:", 3, 5),
        ("等于2:", 2, 3),
        ("等于1:", 1, 2),
    ]
    groups = []
    current = None
    with open(INPUT_DATA, "r", encoding="GBK") as fr, \
            open(OUTPUT_TEMP, "w", encoding="GBK") as fw:
        for line in fr:
            if startPattern.search(line):
                current = [line]
                groups.append(current)
            elif line.startswith("\n"):
                continue
            elif current is not None and keyPattern.search(line):
                # `current` is None until the first marker has been seen;
                # keyword lines before it have no group to join.
                current.append(line)

        # Stable descending sort: equally sized groups keep their input order.
        groups.sort(key=len, reverse=True)
        sizes = [len(group) for group in groups]

        # The header has no trailing newline, so it shares a line with the total.
        fw.write("统计信息")
        fw.write("%s%d\n" % ("总数:", len(groups)))
        for label, low, high in buckets:
            count = sum(
                1 for size in sizes
                if size >= low and (high is None or size < high)
            )
            fw.write("%s%d\n" % (label, count))

        for group in groups:
            if len(group) == 1:
                # Only the marker line survived the keyword filter.
                continue
            fw.write("\n")
            fw.writelines(group)
def filter2():
    """Group the input by start marker and write a size report.

    Reads ``./data/agg_filter.csv`` (GBK). A line containing
    ``######开始######`` opens a new group and is stored as that group's
    first line. Lines that start with a newline character are skipped;
    every other line is appended to the current group. Lines that appear
    before the first marker belong to no group and are ignored.

    Writes ``./data/agg_filter3.csv`` (GBK): a statistics header counting
    groups per size bucket, followed by every group that has more than its
    marker line, largest group first, each preceded by an empty line.
    Group sizes include the marker line itself.
    """
    INPUT_DATA = r"./data/agg_filter.csv"
    OUTPUT_TEMP = "./data/agg_filter3.csv"
    startPattern = re.compile("######开始######")
    # (label, inclusive lower bound, exclusive upper bound); None = no upper bound.
    buckets = [
        ("大于等于1000:", 1000, None),
        ("大于等于500小于1000:", 500, 1000),
        ("大于等于100小于500:", 100, 500),
        ("大于等于50小于100:", 50, 100),
        ("大于等于10小于50:", 10, 50),
        ("大于等于5小于10:", 5, 10),
        ("大于等于3小于5:", 3, 5),
        ("等于2:", 2, 3),
        ("等于1:", 1, 2),
    ]
    groups = []
    current = None
    with open(INPUT_DATA, "r", encoding="GBK") as fr, \
            open(OUTPUT_TEMP, "w", encoding="GBK") as fw:
        for line in fr:
            if startPattern.search(line):
                current = [line]
                groups.append(current)
            elif line.startswith("\n"):
                continue
            elif current is not None:
                # `current` is None until the first marker has been seen;
                # earlier lines have no group to join.
                current.append(line)

        # Stable descending sort: equally sized groups keep their input order.
        groups.sort(key=len, reverse=True)
        sizes = [len(group) for group in groups]

        # The header has no trailing newline, so it shares a line with the total.
        fw.write("统计信息")
        fw.write("%s%d\n" % ("总数:", len(groups)))
        for label, low, high in buckets:
            count = sum(
                1 for size in sizes
                if size >= low and (high is None or size < high)
            )
            fw.write("%s%d\n" % (label, count))

        for group in groups:
            if len(group) == 1:
                # A group holding only its marker line is not written out.
                continue
            fw.write("\n")
            fw.writelines(group)
-
def filter1():
    """Copy the input file, dropping every line that matches a filter entry.

    The filter list is read from ``./data/过滤名单.txt`` (UTF-8), one entry
    per line. Each distinct non-empty entry is compiled as a regular
    expression (entries are not escaped). Blank lines in the list are
    skipped, so entries that follow a blank line are still honoured.

    Every line of ``./data/agg_filter.csv`` (GBK) that matches none of the
    patterns is written unchanged to ``./data/agg_filter2.csv`` (GBK).
    """
    INPUT_DATA = r"./data/agg_filter.csv"
    OUTPUT_TEMP = "./data/agg_filter2.csv"
    FILTER_LIST = "./data/过滤名单.txt"

    with open(FILTER_LIST, "r", encoding="UTF-8") as f_filter:
        # A set removes duplicate entries before compilation.
        entries = {raw.rstrip("\r\n") for raw in f_filter}
    # An empty pattern would match every line, so blank entries are dropped.
    entries.discard("")
    filterPattern = [re.compile(entry) for entry in entries]

    with open(INPUT_DATA, "r", encoding="GBK") as fr, \
            open(OUTPUT_TEMP, "w", encoding="GBK") as fw:
        for line in fr:
            if not any(pattern.search(line) for pattern in filterPattern):
                fw.write(line)
# Script entry point: running this file directly executes only filter3().
# filter1() and filter2() are defined in this file but are not called here.
if __name__ == "__main__":
    filter3()
|