ChenYL · 2 years ago
commit 0125b31d54
5 files changed, 343 additions, 1 deletion
  1. REMEAD.md   (+4, -0)
  2. analyse.py  (+139, -0)
  3. filter.py   (+135, -0)
  4. split.py    (+58, -0)
  5. tools.py    (+7, -1)

+ 4 - 0
REMEAD.md

@@ -1,3 +1,7 @@
+# TODO log
+1. Chained calls
+2. Aggregated result analysis
+
 # Processing steps
 
 1. Download broad-match keywords from 5118 (CSV file)
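Step 1 refers to a broad-keyword CSV exported from 5118. As a rough illustration only, loading such an export might look like the sketch below; the path, the GBK encoding, and the "keyword in the first column" layout are all assumptions, since the actual export format is not part of this commit.

import csv

def load_broad_keywords(path="./data/5118_broad.csv"):
    # hypothetical loader for the 5118 export; path, encoding and column layout
    # are assumptions, not something defined in this repository
    keywords = []
    with open(path, "r", encoding="GBK", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip a header row if one exists
        for row in reader:
            if row and row[0].strip():
                keywords.append(row[0].strip())
    return keywords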

+ 139 - 0
analyse.py

@@ -0,0 +1,139 @@
+# -*- coding:utf-8 -*-
+
+import re
+import mmap
+import tools
+import jieba
+
+def transfer_str(num):
+    # format a count with the Chinese 万 (10,000) unit, e.g. 23456 -> "2万3456"
+    if num >= 10000:
+        return "%d万%d" % (num // 10000, num % 10000)
+    return str(num)
+
+def cal(values):
+    # summarize a list of per-root keyword counts:
+    # number of roots, total keywords, and average keywords per root
+    if not values:
+        return "0", "0", "0"
+    values_len = len(values)
+    values_count = sum(values)
+    sum_msg = transfer_str(values_len)
+    count_msg = transfer_str(values_count)
+    avg_msg = transfer_str(int(values_count / values_len))
+    return sum_msg, count_msg, avg_msg
+
+def tip(condition, values):
+    # print a one-line summary for the given bucket condition
+    print("条件:%s - 涉及:%s个词根,涉及词数:%s,平均约:%s 词数/词根" % ((condition,) + cal(values)))
+
+def keyStat(fmap: mmap.mmap, keyword: str):
+    # count which jieba tokens co-occur on lines that contain the given keyword
+    fmap.seek(0)
+    pattern = re.compile(keyword)
+    stopWord = tools.load_stop_word()
+    totalSize = fmap.size()
+
+    statDict = {}
+    while True:
+        curPos = fmap.tell()
+        if curPos >= totalSize:
+            break
+
+        lineContent = fmap.readline().decode("UTF-8")
+        tmpList = pattern.findall(lineContent)
+        if tmpList:
+            cutList = list(jieba.cut_for_search(lineContent.replace("\r", "").replace("\n", "")))
+            for cutKeyword in cutList:
+                if cutKeyword in stopWord:
+                    continue
+
+                statDict[cutKeyword] = statDict.get(cutKeyword, 0) + 1
+    
+    sorted_key_list = sorted(statDict.items(), key=lambda x: x[1], reverse=True)
+
+    print("与关键词:%s 相关的词共计:%d" % (keyword, len(sorted_key_list)))
+
+    count_list = list(statDict.values())
+
+    tip("等于1", [val for val in count_list if val == 1])
+
+    tip("大于1小于100", [val for val in count_list if val > 1 and val < 100])
+
+    tip("大于等于100小于200", [val for val in count_list if val >= 100 and val < 200])
+
+    tip("大于等于200小于300", [val for val in count_list if val >= 200 and val < 300])
+
+    tip("大于等于300小于400", [val for val in count_list if val >= 300 and val < 400])
+
+    tip("大于等于400小于500", [val for val in count_list if val >= 400 and val < 500])
+        
+    tip("大于等于500小于1000", [val for val in count_list if val >= 500 and val < 1000])
+
+    tip("大于等于1000小于5000", [val for val in count_list if val >= 1000 and val < 5000])
+
+    tip("大于等于5000小于1万", [val for val in count_list if val >= 5000 and val < 10000])
+
+    tip("大于等于1万小于5万", [val for val in count_list if val >= 10000 and val < 50000])
+
+    tip("大于等于5万小于10万", [val for val in count_list if val >= 50000 and val < 100000])
+
+    tip("大于等于10万", [val for val in count_list if val >= 100000])
+
+    with open("./data/test/stat_%s.csv" % keyword, "w", encoding="UTF-8") as fw:
+        for key, count in sorted_key_list:
+            if count > 1:
+                fw.write("%s,%d\n" % (key, count))
+
+def keyFilter(fmap: mmap.mmap, keyword: str):
+    # write every line that matches the keyword to ./data/test/filter_<keyword>.csv
+    fmap.seek(0)
+    pattern = re.compile(keyword)
+
+    totalSize = fmap.size()
+
+    with open("./data/test/filter_%s.csv" % keyword, "w", encoding="UTF-8") as fw:
+        while True:
+            curPos = fmap.tell()
+            if curPos >= totalSize:
+                break
+
+            lineContent = fmap.readline().decode("UTF-8")
+            tmpList = pattern.findall(lineContent)
+            if tmpList:
+                fw.write("%s\n" % lineContent.replace("\r", "").replace("\n", ""))
+            
+
+def countKeyword(fmap: mmap.mmap, keywords: set):
+    # for each keyword, count how many lines of the mapped file contain it
+    for keyword in keywords:
+        fmap.seek(0)
+        pattern = re.compile(keyword)
+
+        count = 0
+
+        while True:
+            lineContent = fmap.readline().decode("UTF-8")
+            if not lineContent:
+                break
+
+            tmpList = pattern.findall(lineContent)
+            if tmpList:
+                count += 1
+
+        print("关键词:%s,共出现次数:%d" % (keyword, count))
+
+
+INPUT_FILE = "./data/tmp/merge.csv"
+
+with open(INPUT_FILE, "r", encoding="UTF-8") as f, \
+    mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as f_mmap:
+
+    # 过滤名单.txt holds the filter (block) list, one keyword per line
+    filterSet = set()
+    with open("./data/过滤名单.txt", "r", encoding="UTF-8") as f_filter:
+        while True:
+            lineContent = f_filter.readline().replace("\n", "").replace("\r", "")
+            if not lineContent:
+                break
+
+            filterSet.add(lineContent)
+
+    countKeyword(f_mmap, filterSet)
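For reference, keyStat and keyFilter can be driven from the same mapped file in the same way countKeyword is called above. A minimal usage sketch using the functions defined in this file; the keyword "赚钱" is borrowed from filter.py purely as an example, and ./data/test/ is assumed to exist:

with open("./data/tmp/merge.csv", "r", encoding="UTF-8") as f, \
    mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as f_mmap:

    keyStat(f_mmap, "赚钱")    # prints bucket statistics, writes ./data/test/stat_赚钱.csv
    keyFilter(f_mmap, "赚钱")  # writes matching lines to ./data/test/filter_赚钱.csv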

+ 135 - 0
filter.py

@@ -0,0 +1,135 @@
+# -*- coding: utf-8 -*-
+
+import re
+
+def filter3():
+    # group lines by the "######开始######" marker, keep only lines that contain
+    # the key pattern "赚钱", then write group-size statistics and the groups
+    # with more than one line to agg_filter4.csv
+    INPUT_DATA = r"./data/agg_filter3.csv"
+    OUTPUT_TEMP = "./data/agg_filter4.csv"
+
+    startPattern = re.compile("######开始######")
+    keyPattern = re.compile("赚钱")
+
+    total = []
+    sub = None
+    with open(INPUT_DATA, "r", encoding="GBK") as fr,\
+        open(OUTPUT_TEMP, "w", encoding="GBK") as fw:
+
+        for line in fr.readlines():
+            
+            tl = startPattern.findall(line)
+            if len(tl) > 0:
+                sub = []
+                sub.append(line)
+                total.append(sub)
+            elif line.startswith("\n"):
+                continue
+            else:
+                kl = keyPattern.findall(line)
+                if len(kl)>0:
+                    sub.append(line)
+    
+        sortedList = sorted(total, key=lambda x:len(x), reverse=True)
+
+        fw.write("统计信息")
+        fw.write("%s%d\n" % ("总数:", len(sortedList)))
+        fw.write("%s%d\n" %("大于等于1000:", len([subList for subList in sortedList if len(subList)>=1000])))
+        fw.write("%s%d\n" %("大于等于500小于1000:", len([subList for subList in sortedList if len(subList)>=500 and len(subList) < 1000])))
+        fw.write("%s%d\n" %("大于等于100小于500:", len([subList for subList in sortedList if len(subList)>=100 and len(subList) < 500])))
+        fw.write("%s%d\n" %("大于等于50小于100:", len([subList for subList in sortedList if len(subList)>=50 and len(subList)<100])))
+        fw.write("%s%d\n" %("大于等于10小于50:", len([subList for subList in sortedList if len(subList)>=10 and len(subList)<50])))
+        fw.write("%s%d\n" %("大于等于5小于10:", len([subList for subList in sortedList if len(subList)>=5 and len(subList)<10])))
+        fw.write("%s%d\n" %("大于等于3小于5:", len([subList for subList in sortedList if len(subList)>=3 and len(subList)<5])))
+        fw.write("%s%d\n" %("等于2:", len([subList for subList in sortedList if len(subList)==2])))
+        fw.write("%s%d\n" %("等于1:", len([subList for subList in sortedList if len(subList)==1])))
+       
+        for subList in sortedList:
+            if len(subList) == 1:
+                continue
+
+            fw.write("\n")
+            for line in subList:
+                fw.write(line)
+
+def filter2():
+    # group lines by the "######开始######" marker, write group-size statistics,
+    # then dump every group with more than one line to agg_filter3.csv
+    INPUT_DATA = r"./data/agg_filter.csv"
+    OUTPUT_TEMP = "./data/agg_filter3.csv"
+
+    startPattern = re.compile("######开始######")
+
+    total = []
+    sub = None
+    with open(INPUT_DATA, "r", encoding="GBK") as fr,\
+        open(OUTPUT_TEMP, "w", encoding="GBK") as fw:
+
+        for line in fr.readlines():
+            
+            tl = startPattern.findall(line)
+            if len(tl) > 0:
+                sub = []
+                sub.append(line)
+                total.append(sub)
+            elif line.startswith("\n"):
+                continue
+            else:
+                sub.append(line)
+    
+        sortedList = sorted(total, key=lambda x:len(x), reverse=True)
+
+        fw.write("统计信息")
+        fw.write("%s%d\n" % ("总数:", len(sortedList)))
+        fw.write("%s%d\n" %("大于等于1000:", len([subList for subList in sortedList if len(subList)>=1000])))
+        fw.write("%s%d\n" %("大于等于500小于1000:", len([subList for subList in sortedList if len(subList)>=500 and len(subList) < 1000])))
+        fw.write("%s%d\n" %("大于等于100小于500:", len([subList for subList in sortedList if len(subList)>=100 and len(subList) < 500])))
+        fw.write("%s%d\n" %("大于等于50小于100:", len([subList for subList in sortedList if len(subList)>=50 and len(subList)<100])))
+        fw.write("%s%d\n" %("大于等于10小于50:", len([subList for subList in sortedList if len(subList)>=10 and len(subList)<50])))
+        fw.write("%s%d\n" %("大于等于5小于10:", len([subList for subList in sortedList if len(subList)>=5 and len(subList)<10])))
+        fw.write("%s%d\n" %("大于等于3小于5:", len([subList for subList in sortedList if len(subList)>=3 and len(subList)<5])))
+        fw.write("%s%d\n" %("等于2:", len([subList for subList in sortedList if len(subList)==2])))
+        fw.write("%s%d\n" %("等于1:", len([subList for subList in sortedList if len(subList)==1])))
+
+        for subList in sortedList:
+            if len(subList) == 1:
+                continue
+
+            fw.write("\n")
+            for line in subList:
+                fw.write(line)
+
+        
+
+def filter1():
+    # drop every line that matches any pattern from the filter list (过滤名单.txt)
+    # INPUT_DATA = r"E:\Documents\Code\LongTailKeyDataMining\agg.csv"
+    INPUT_DATA = r"./data/agg_filter.csv"
+    OUTPUT_TEMP = "./data/agg_filter2.csv"
+
+    filterPattern = []
+    with open("./data/过滤名单.txt", "r", encoding="UTF-8") as f_filter:
+        filterSet = set()
+        while True:
+            lineContent = f_filter.readline().replace("\n","").replace("\r","")
+            if not lineContent:
+                break
+                
+            filterSet.add(lineContent)
+
+        for r in filterSet:
+            filterPattern.append(re.compile(r))
+
+    with open(INPUT_DATA, "r", encoding="GBK") as fr,\
+        open(OUTPUT_TEMP, "w", encoding="GBK") as fw:
+
+        for line in fr.readlines():
+            writeFlag = True
+            for p in filterPattern:
+                l = p.findall(line)
+                if len(l) > 0:
+                    writeFlag = False
+                    break
+            
+            if writeFlag:
+                fw.write(line)
+
+if __name__ == '__main__':
+    filter3()
+
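The REMEAD.md TODO item "Chained calls" presumably means running these passes back to back. A minimal sketch under that assumption follows; note that, as committed, filter2 reads agg_filter.csv rather than filter1's output, so the INPUT_DATA constants would need to be aligned first.

def run_pipeline():
    # hypothetical chaining of the three passes, assuming each step's INPUT_DATA
    # is pointed at the previous step's output file
    filter1()  # agg_filter.csv  -> agg_filter2.csv : drop lines matching 过滤名单.txt
    filter2()  # agg_filter2.csv -> agg_filter3.csv : group by marker, write size stats
    filter3()  # agg_filter3.csv -> agg_filter4.csv : keep only lines containing "赚钱"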

+ 58 - 0
split.py

@@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+
+import re
+
+def split():
+    # split the aggregated "######开始######" groups into files bucketed by group size
+    INPUT_DATA = r"./data/agg_filter.csv"
+    OUTPUT_TEMP = "./data/split/agg_split_%d.txt"    # only used by the commented-out chunked split below
+    OUTPUT_TEMP2 = "./data/split/agg_split_%d_%d.txt"
+
+    startPattern = re.compile("######开始######")
+    con_l = []
+    sub = None
+    with open(INPUT_DATA, "r", encoding="GBK") as fr:
+        for line in fr.readlines():
+            tl = startPattern.findall(line)
+            if len(tl) > 0:
+                sub = []
+                sub.append(line)
+                con_l.append(sub)
+            elif line.startswith("\n"):
+                continue
+            else:
+                sub.append(line)
+    
+    # step = 71500
+    # for i, v in enumerate(range(0, len(con_l), step)):
+    #     with open(OUTPUT_TEMP % (i+1), "w", encoding="GBK") as fw:
+    #         for ele in con_l[v:v+step]:
+    #             if len(ele) == 1:
+    #                 continue
+
+    #             fw.write("\n")
+    #             for content in ele:
+    #                 fw.write(content)
+    filter_l = [
+        (1000, 1000, [subList for subList in con_l if len(subList)>=1000]),
+        (500, 1000, [subList for subList in con_l if len(subList)>=500 and len(subList) < 1000]),
+        (100,500,[subList for subList in con_l if len(subList)>=100 and len(subList) < 500]),
+        (50,100,[subList for subList in con_l if len(subList)>=50 and len(subList)<100]),
+        (10,50,[subList for subList in con_l if len(subList)>=10 and len(subList)<50]),
+        (5,10,[subList for subList in con_l if len(subList)>=5 and len(subList)<10]),
+        (3,5,[subList for subList in con_l if len(subList)>=3 and len(subList)<5]),
+        (2,2,[subList for subList in con_l if len(subList)==2])
+        # (1,1,[subList for subList in con_l if len(subList)==1])
+    ]
+
+    for start, end, sublist in filter_l:
+        with open(OUTPUT_TEMP2 % (start, end), "w", encoding="GBK") as fw:
+            for ele in sublist:
+                fw.write("\n")
+                for content in ele:
+                    fw.write(content)
+
+if __name__ == '__main__':
+    split()
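The size buckets in filter_l (and the matching statistics lines in filter.py) repeat the same comprehension for every range. A small helper along the lines below could express that once; the helper name and the half-open [low, high) convention are illustrative, not part of this commit.

def bucket(groups, low, high=None):
    # groups whose size falls in [low, high); high=None means no upper bound
    return [g for g in groups if len(g) >= low and (high is None or len(g) < high)]

# e.g. the (500, 1000) entry of filter_l could then be written as
# (500, 1000, bucket(con_l, 500, 1000)), and the ">=1000" entry as
# (1000, 1000, bucket(con_l, 1000)).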

+ 7 - 1
tools.py

@@ -186,4 +186,10 @@ def avg_split_task(total:int, split_internal:int):
             end_pos = -1
         tasks.append([start_pos,end_pos])
     
-    return tasks
+    return tasks
+
+if __name__ == "__main__":
+    # dump the loaded stop words to ./data/stopword.txt, one per line
+    stop_word = load_stop_word()
+    with open("./data/stopword.txt", "w", encoding="UTF-8") as f:
+        for stopWord in stop_word.keys():
+            f.write("%s\n" % stopWord)
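load_stop_word() itself is not part of this diff. Judging from the .keys() call above and the "cutKeyword in stopWord" membership test in analyse.py, it appears to return a dict keyed by stop word. A minimal sketch of an equivalent loader under that assumption; the source path is a placeholder, since the real one is not shown:

def load_stop_word_sketch(path):
    # assumed shape: a dict keyed by stop word, matching how load_stop_word() is used
    stop_word = {}
    with open(path, "r", encoding="UTF-8") as f:
        for line in f:
            word = line.strip()
            if word:
                stop_word[word] = True
    return stop_word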