|
@@ -0,0 +1,139 @@
|
|
|
|
|
+# -*- coding:utf-8 -*-
|
|
|
|
|
+
|
|
|
|
|
+import re
|
|
|
|
|
+import mmap
|
|
|
|
|
+import tools
|
|
|
|
|
+import jieba
|
|
|
|
|
+
|
|
|
|
|
def transfer_str(num):
    """Format an integer for display, using the 万 (10,000) unit when large.

    Values below 10000 are returned as their plain decimal text. Values of
    10000 and above are rendered as ``<quotient>万<remainder>``. The
    remainder is zero-padded to four digits so the result is unambiguous:
    10005 becomes ``1万0005`` instead of ``1万5`` (which reads as 15000).

    Args:
        num: Integer to format.

    Returns:
        The formatted string.
    """
    if num >= 10000:
        wan, remainder = divmod(num, 10000)
        return "%d万%04d" % (wan, remainder)
    return str(num)
|
|
|
|
|
+
|
|
|
|
|
def cal(list):
    """Summarise a collection of counts as three display strings.

    Args:
        list: Sequence of numeric counts. The parameter name shadows the
            builtin; it is kept so existing callers keep working.

    Returns:
        A 3-tuple ``(entries, total, average)`` where ``entries`` is the
        number of items, ``total`` is their sum and ``average`` is the
        total divided by the number of items, truncated to an integer.
        Each element is formatted with ``transfer_str``. An empty input
        yields an average of 0 instead of raising ZeroDivisionError.
    """
    entry_count = len(list)
    total = sum(list)
    # Guard the division: callers pass filtered buckets that may be empty.
    average = int(total / entry_count) if entry_count else 0
    return (
        transfer_str(entry_count),
        transfer_str(total),
        transfer_str(average),
    )
|
|
|
|
|
+
|
|
|
|
|
def tip(condition, list):
    """Print a one-line summary of the counts selected by *condition*.

    Args:
        condition: Human-readable label describing how ``list`` was selected.
        list: Counts to summarise; passed unchanged to ``cal``.
    """
    root_total, word_total, average = cal(list)
    summary = "条件:%s - 涉及:%s个词根,涉及词数:%s,平均约:%s 词数/词根" % (
        condition,
        root_total,
        word_total,
        average,
    )
    print(summary)
|
|
|
|
|
+
|
|
|
|
|
def keyStat(fmap: mmap.mmap, keyword: str):
    """Report word-frequency statistics for the lines that match *keyword*.

    Every line of ``fmap`` that matches ``keyword`` is segmented with
    ``jieba.cut_for_search``; segments found in the stop-word collection
    are skipped and the rest are counted. The function then prints the
    number of distinct segments, prints one ``tip`` summary per count
    bucket, and writes every segment seen more than once to
    ``./data/test/stat_<keyword>.csv`` as ``segment,count`` rows ordered by
    descending count.

    Args:
        fmap: Memory-mapped, UTF-8 encoded input; it is rewound to offset 0
            before scanning.
        keyword: Pattern to look for. It is compiled with ``re.compile``,
            so regular-expression metacharacters are interpreted.
    """
    # (label, inclusive lower bound, exclusive upper bound; None = unbounded)
    buckets = (
        ("等于1", 1, 2),
        ("大于1小于100", 2, 100),
        ("大于等于100小于200", 100, 200),
        ("大于等于200小于300", 200, 300),
        ("大于等于300小于400", 300, 400),
        ("大于等于400小于500", 400, 500),
        ("大于等于500小于1000", 500, 1000),
        ("大于等于1000小于5000", 1000, 5000),
        ("大于等于5000小于1万", 5000, 10000),
        ("大于等于1万小于5万", 10000, 50000),
        ("大于等于5万小于10万", 50000, 100000),
        ("大于等于10万", 100000, None),
    )

    pattern = re.compile(keyword)
    # presumably a set/list of words to ignore -- confirm against tools
    stop_words = tools.load_stop_word()
    stat = {}

    fmap.seek(0)
    # Read from the mapping that was passed in (not a module-level global);
    # mmap.readline() returns b"" once the end of the mapping is reached.
    for raw_line in iter(fmap.readline, b""):
        line = raw_line.decode("UTF-8")
        if not pattern.search(line):
            continue

        text = line.replace("\r", "").replace("\n", "")
        for token in jieba.cut_for_search(text):
            if token in stop_words:
                continue
            stat[token] = stat.get(token, 0) + 1

    # sorted() is stable, so equal counts keep first-seen order.
    ranked = sorted(stat.items(), key=lambda item: item[1], reverse=True)

    print("与关键词:%s 相关的词共计:%d" % (keyword, len(ranked)))

    counts = list(stat.values())
    for label, low, high in buckets:
        tip(label, [c for c in counts
                    if c >= low and (high is None or c < high)])

    with open("./data/test/stat_%s.csv" % keyword, "w", encoding="UTF-8") as fw:
        for key, count in ranked:
            # Segments seen only once are left out of the CSV.
            if count > 1:
                fw.write("%s,%d\n" % (key, count))
|
|
|
|
|
+
|
|
|
|
|
def keyFilter(fmap: mmap.mmap, keyword: str):
    """Write every line of *fmap* that matches *keyword* to a CSV file.

    Matching lines are written, with CR/LF characters removed and a single
    ``\\n`` appended, to ``./data/test/filter_<keyword>.csv``. The target
    directory must already exist.

    Args:
        fmap: Memory-mapped, UTF-8 encoded input; it is rewound to offset 0
            before scanning.
        keyword: Pattern to look for. It is compiled with ``re.compile``,
            so regular-expression metacharacters are interpreted.
    """
    pattern = re.compile(keyword)
    fmap.seek(0)

    with open("./data/test/filter_%s.csv" % keyword, "w", encoding="UTF-8") as fw:
        # Read from the mapping that was passed in (not a module-level
        # global); mmap.readline() returns b"" at the end of the mapping.
        for raw_line in iter(fmap.readline, b""):
            line = raw_line.decode("UTF-8")
            if pattern.search(line):
                fw.write("%s\n" % line.replace("\r", "").replace("\n", ""))
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def countKeyword(fmap: mmap.mmap, keywords: set):
    """Print, for each keyword, how many lines of *fmap* match it.

    The reported number is a count of matching lines, not of individual
    occurrences: a line containing the keyword several times counts once.

    Args:
        fmap: Memory-mapped, UTF-8 encoded input; it is rewound to offset 0
            for every keyword.
        keywords: Patterns to look for. Each one is compiled with
            ``re.compile``, so regular-expression metacharacters are
            interpreted.
    """
    for keyword in keywords:
        pattern = re.compile(keyword)

        # Scan the mapping that was passed in (not a module-level global);
        # mmap.readline() returns b"" at the end of the mapping.
        fmap.seek(0)
        count = sum(
            1
            for raw_line in iter(fmap.readline, b"")
            if pattern.search(raw_line.decode("UTF-8"))
        )

        print("关键词:%s,共出现次数:%d" % (keyword, count))
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# Merged input that is memory-mapped and scanned for each filter keyword.
INPUT_FILE = "./data/tmp/merge.csv"

# Only the file descriptor is handed to mmap, so the file is opened in
# binary mode; lines are decoded by the scanning functions themselves.
with open(INPUT_FILE, "rb") as f, \
        mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as f_mmap:

    # Load one keyword per line. Blank lines are skipped rather than treated
    # as end-of-file, so entries after an empty line are not lost.
    with open("./data/过滤名单.txt", "r", encoding="UTF-8") as f_filter:
        stripped_lines = (
            raw.replace("\n", "").replace("\r", "") for raw in f_filter
        )
        filterSet = {entry for entry in stripped_lines if entry}

    countKeyword(f_mmap, filterSet)
|