2 years ago · 97c406ebd0
--- a/.editorconfig
+++ b/.editorconfig
@@ -5,7 +5,7 @@ root = true
 
				 #缩进风格：空格
			
 
				 indent_style = space
			
 
				 #缩进大小4
			
 
				-indent_size = 2
			
 
				+indent_size = 4
			
 
				 #换行符lf
			
 
				 end_of_line = lf
			
 
				 #字符集utf-8
			
--- a/.gitignore
+++ b/.gitignore
@@ -1,17 +1,5 @@
 
				 __pycache__/
			
 
				-data/
			
 
				-data/tmp/*.pkl
			
 
				-data/tmp/*.txt
			
 
				-data/tmp/*.csv
			
 
				-data/pkl/*.pkl
			
 
				-data/*.txt
			
 
				-data/*.csv
			
 
				-临时/
			
 
				-data_bak/
			
 
				-src_bak/
			
 
				-data/analyse/
			
 
				-data/analyse_bak/
			
 
				-data/cache/
			
 
				+
			
 
				 # Byte-compiled / optimized / DLL files
			
 
				 __pycache__/
			
 
				 *.py[cod]
			
@@ -174,4 +162,5 @@ cython_debug/
 
				 .idea/
			
 
				 *.iml
			
 
				 
			
 
				-src/config.json
			
 
				+tmp/
			
 
				+!src/tmp
			
--- a/README.md
+++ b/README.md
@@ -13,3 +13,49 @@
 
				   - 增加支持4级筛选;调整界面结构，增大结果区域
			
 
				   - 增加历史记录功能，能打开上一次的位置
			
 
				   - 增加显示筛选数量和结果数量
			
 
				+
			
 
				+# 待办日志
			
 
				+1. 链式调用
			
 
				+2. 聚合结果分析
			
 
				+
			
 
				+# 处理步骤
			
 
				+
			
 
				+1. 从5118下载泛词（csv文件）
			
 
				+
			
 
				+2. 对泛词进行分词处理（cut.py）
			
 
				+
			
 
				+  * 分词和词频统计
			
 
				+  * 根据词频进行倒序排列
			
 
				+
			
 
				+3. 根据词频获取拓展词
			
 
				+
			
 
				+4. 把所有拓展词合并到一个文件中（merge.py）
			
 
				+
			
 
				+5. 生成关键词文件，包含三个要素：序号、关键词、分词结果（key.py）
			
 
				+
			
 
				+6. 对关键词文件生成索引文件（key_index.py）
			
 
				+
			
 
				+7. 根据关键词文件生成倒排文件（key_reverse.py）
			
 
				+
			
 
				+8. 根据关键词文件、索引文件、倒排文件生成最终的聚合分析文件（agg_word.py）
			
 
				+
			
 
				+
			
 
				+# 进程相关
			
 
				+1. 普通的只有一个进程
			
 
				+2. 普通使用ProcessPoolExecutor只有4个进程
			
 
				+3. 如果使用Manager会额外多一个进程
			
 
				+
			
 
				+# 文件位置相关
			
 
				+1. with open 与 mmap 读取到的每行的位置是一样的
			
 
				+
			
 
				+# 文件读取速度比较
			
 
				+1. 从单进程的角度看：仅以顺序读文件来说mmap的速度快很多，如果对内容进行编码，速度也较with open快一点
			
 
				+2. 从多进程的角度看：仅以顺序读文件来说mmap的速度快很多，如果对内容进行编码，速度也较with open快一点
			
 
				+3. 仅以顺序读取同一个文件，with open 与 mmap 均是 单进程读取一次的速度 比 多进程读取一次要快，而且with open的差距更明显
			
 
				+4. 多进程分段读取中 mmap比with open快很多，with open非常的慢
			
 
				+* 总结：
			
 
				+  * mmap（单）>>mmap(单，对内容进行编码)>with open(单)
			
 
				+  * mmap（多）>>mmap(多，对内容进行编码)>with open(多)
			
 
				+  * （这个结论没啥用处，因为多进程一般是完成不同的任务）仅以顺序读取同一个文件，with open 与 mmap 均是 单进程读取一次的速度 比 多进程读取一次要快，而且with open的差距更明显
			
 
				+  * 多进程分段读取中 mmap比with open快很多，with open非常的慢
			
 
				+
			
--- a/REMEAD.md
+++ b/REMEAD.md
@@ -1,45 +0,0 @@
 
				-# 待办日志
			
 
				-1. 链式调用
			
 
				-2. 聚合结果分析
			
 
				-
			
 
				-# 处理步骤
			
 
				-
			
 
				-1. 从5118下载泛词（csv文件）
			
 
				-
			
 
				-2. 对泛词进行分词处理（cut.py）
			
 
				-
			
 
				-    * 分词和词频统计
			
 
				-    * 根据词频进行倒序排列
			
 
				-
			
 
				-3. 根据词频获取拓展词
			
 
				-
			
 
				-4. 把所有拓展词合并到一个文件中（merge.py）
			
 
				-
			
 
				-5. 生成关键词文件，包含三个要素：序号、关键词、分词结果（key.py）
			
 
				-
			
 
				-6. 对关键词文件生成索引文件（key_index.py）
			
 
				-
			
 
				-7. 根据关键词文件生成倒排文件（key_reverse.py）
			
 
				-
			
 
				-8. 根据关键词文件、索引文件、倒排文件生成最终的聚合分析文件（agg_word.py）
			
 
				-
			
 
				-
			
 
				-# 进程相关
			
 
				-1. 普通的只有一个进程
			
 
				-2. 普通使用ProcessPoolExecutor只有4个进程
			
 
				-3. 如果使用Manager会额外多一个进程
			
 
				-
			
 
				-# 文件位置相关
			
 
				-1. with open 与 mmap 读取到的每行的位置是一样的
			
 
				-
			
 
				-# 文件读取速度比较
			
 
				-1. 从单进程的角度看：仅以顺序读文件来说mmap的速度快很多，如果对内容进行编码，速度也较with open快一点
			
 
				-2. 从多进程的角度看：仅以顺序读文件来说mmap的速度快很多，如果对内容进行编码，速度也较with open快一点
			
 
				-3. 仅以顺序读取同一个文件，with open 与 mmap 均是 单进程读取一次的速度 比 多进程读取一次要快，而且with open的差距更明显
			
 
				-4. 多进程分段读取中 mmap比with open快很多，with open非常的慢
			
 
				-* 总结：
			
 
				-    * mmap（单）>>mmap(单，对内容进行编码)>with open(单)
			
 
				-    * mmap（多）>>mmap(多，对内容进行编码)>with open(多)
			
 
				-    * （这个结论没啥用处，因为多进程一般是完成不同的任务）仅以顺序读取同一个文件，with open 与 mmap 均是 单进程读取一次的速度 比 多进程读取一次要快，而且with open的差距更明显
			
 
				-    * 多进程分段读取中 mmap比with open快很多，with open非常的慢
			
 
				-
			
--- a/env.yaml
+++ b/env.yaml
--- a/src/conf/stopwords/baidu_stopwords.txt
+++ b/src/conf/stopwords/baidu_stopwords.txt
--- a/src/conf/stopwords/cn_stopwords.txt
+++ b/src/conf/stopwords/cn_stopwords.txt
--- a/src/conf/stopwords/hit_stopwords.txt
+++ b/src/conf/stopwords/hit_stopwords.txt
--- a/src/conf/stopwords/scu_stopwords.txt
+++ b/src/conf/stopwords/scu_stopwords.txt
--- a/src/conf/stopwords/停用词.txt
+++ b/src/conf/stopwords/停用词.txt
--- a/src/money.py
+++ b/src/money.py
@@ -0,0 +1,98 @@
 
				+# -*- coding:utf-8 -*-
			
 
				+
			
 
				+import os
			
 
				+import utils
			
 
				+import jieba
			
 
				+import zipfile
			
 
				+
			
 
				+
			
 
				+def extract_word_from_5118(file_path: str):
			
 
				+    """
			
 
				+    从5118关键词压缩文件中提取数据
			
 
				+    :param file_path: 待处理文件夹路径
			
 
				+    :return: None
			
 
				+    """
			
 
				+    file_list = []
			
 
				+    for file in os.listdir(file_path):
			
 
				+        file_list.append(os.path.join(file_path, file))
			
 
				+
			
 
				+    for i, file in enumerate(file_list):
			
 
				+        zfile = zipfile.ZipFile(file)
			
 
				+        filenames = zfile.namelist()
			
 
				+        for filename in filenames:
			
 
				+            # 重新编码文件名为正确形式
			
 
				+            real_name = filename.encode('cp437').decode('gbk')
			
 
				+
			
 
				+            # 排除无效文件
			
 
				+            if real_name in ['打开乱码如何处理？.txt']:
			
 
				+                continue
			
 
				+
			
 
				+            # 关键词存放容器
			
 
				+            word_container = set()
			
 
				+
			
 
				+            # 读取压缩文件中的文件
			
 
				+            with zfile.open(filename) as file_content:
			
 
				+                lines = file_content.readlines()
			
 
				+                # 跳过开头两行
			
 
				+                for line in lines[2:]:
			
 
				+                    split = line.decode("gbk").split(",")
			
 
				+                    # 只需要第一列的数据
			
 
				+                    word_container.add(split[0])
			
 
				+
			
 
				+            output_file_name = real_name[0:real_name.index("--")]
			
 
				+            output_file_path = os.path.join(file_path, output_file_name + "_长尾词.txt")
			
 
				+            with open(output_file_path, "w", encoding="utf-8") as f:
			
 
				+                for item in word_container:
			
 
				+                    f.write(item)
			
 
				+                    f.write("\n")
			
 
				+
			
 
				+
			
 
				+def word_split_statistics(file_path: str):
			
 
				+    """
			
 
				+    分词统计
			
 
				+    :param file_path: 待处理文件夹路径
			
 
				+    :return: None
			
 
				+    """
			
 
				+
			
 
				+    file_list = []
			
 
				+    for file in os.listdir(file_path):
			
 
				+        file_list.append(os.path.join(file_path, file))
			
 
				+
			
 
				+    stop_word_dict = utils.load_stop_word()
			
 
				+
			
 
				+    for i, file in enumerate(file_list):
			
 
				+        if not file.endswith("_长尾词.txt"):
			
 
				+            continue
			
 
				+
			
 
				+        # 分词结果容器
			
 
				+        key_dict = {}
			
 
				+
			
 
				+        with open(file, "r", encoding="utf-8") as f:
			
 
				+            for word in f:
			
 
				+                # 分词
			
 
				+                word_list = jieba.cut_for_search(word.replace("\n", ""))
			
 
				+                # 统计
			
 
				+                for word in word_list:
			
 
				+                    # 过滤停用词
			
 
				+                    if word in stop_word_dict:
			
 
				+                        continue
			
 
				+
			
 
				+                    if word in key_dict:
			
 
				+                        key_dict[word] = key_dict[word] + 1
			
 
				+                    else:
			
 
				+                        key_dict[word] = 1
			
 
				+
			
 
				+        # 根据词频进行倒序排列
			
 
				+        sorted_key_list = sorted(key_dict.items(), key=lambda x: x[1], reverse=True)
			
 
				+
			
 
				+        output_file_name = file[file.rindex("\\")+1:file.index("_长尾词.txt")]
			
 
				+        output_file_path = os.path.join(file_path, output_file_name + "_长尾词_分词统计.csv")
			
 
				+        with open(output_file_path, "w", encoding="UTF-8") as f:
			
 
				+            for key, count in sorted_key_list:
			
 
				+                f.write("%s,%d\n" % (key, count))
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    file_path = "E:\Download\原始词库"
			
 
				+    # file_path = "E:\Download\测试"
			
 
				+    extract_word_from_5118(file_path)
			
 
				+    # word_split_statistics(file_path)
			
--- a/src/tmp/__init__.py
+++ b/src/tmp/__init__.py
--- a/src/tmp/agg_word.py
+++ b/src/tmp/agg_word.py
--- a/src/tmp/analyse.py
+++ b/src/tmp/analyse.py
--- a/src/tmp/cal.py
+++ b/src/tmp/cal.py
--- a/src/tmp/config.py
+++ b/src/tmp/config.py
@@ -40,7 +40,7 @@ AGG_ANALYSE_FILE = "./data/analyse/%s.csv"
 
				 AGG_FILE = "./data/agg_analyse.csv"
			
 
				 
			
 
				 # 停用词存放文件夹
			
 
				-STOP_WORD_DIR = "./data/stopwords"
			
 
				+STOP_WORD_DIR = "../conf/stopwords"
			
 
				 
			
 
				 # 停用词模型 缓存
			
 
				 STOP_WORD_CACHE = "./data/cache/stop_word.pkl"
			
@@ -58,4 +58,4 @@ RE_SPECIAL_SIMBOL = [".", "?", "^", "$", "*", "+", "\\", "[", "]", "|", "{", "}"
 
				 PRECENT_TIPS = 0.01
			
 
				 
			
 
				 # 正则提取关键词表中的信息
			
 
				-KEY_RE_PATTERAN = r"(\d+),([^,]*),(.*)"
			
 
				+KEY_RE_PATTERAN = r"(\d+),([^,]*),(.*)"
			
--- a/src/tmp/cut.py
+++ b/src/tmp/cut.py
--- a/src/tmp/filter.py
+++ b/src/tmp/filter.py
--- a/src/tmp/filter/DataFilter.py
+++ b/src/tmp/filter/DataFilter.py
--- a/src/tmp/filter/DataFilter.ui
+++ b/src/tmp/filter/DataFilter.ui
--- a/src/tmp/filter/__init__.py
+++ b/src/tmp/filter/__init__.py
--- a/src/tmp/filter/main.py
+++ b/src/tmp/filter/main.py
@@ -5,10 +5,10 @@ import re
 
				 import sys
			
 
				 from functools import partial
			
 
				 
			
 
				-from PySide6.QtWidgets import QMainWindow, QApplication, QMessageBox, QFileDialog, QWidget, QLineEdit, QPushButton, \
			
 
				+from PySide6.QtWidgets import QApplication, QMessageBox, QFileDialog, QWidget, QLineEdit, QPushButton, \
			
 
				   QTextEdit, QTextBrowser
			
 
				 
			
 
				-from src.DataFilter import Ui_Form
			
 
				+from src.tmp.filter.DataFilter import Ui_Form
			
 
				 
			
 
				 category_pattern = re.compile(r'\[类别\]')
			
 
				 digit_pattern = re.compile(r'\[数字\]')
			
--- a/src/tmp/key.py
+++ b/src/tmp/key.py
--- a/src/tmp/key_index.py
+++ b/src/tmp/key_index.py
--- a/src/tmp/key_reverse.py
+++ b/src/tmp/key_reverse.py
--- a/src/tmp/key_reverse_index.py
+++ b/src/tmp/key_reverse_index.py
--- a/src/tmp/key_reverse_statistics.py
+++ b/src/tmp/key_reverse_statistics.py
@@ -2,19 +2,12 @@
 
				 
			
 
				 from concurrent.futures import ProcessPoolExecutor, as_completed
			
 
				 import mmap
			
 
				-from multiprocessing.connection import wait
			
 
				-import random
			
 
				-import sys
			
 
				-from time import sleep, time
			
 
				 import os
			
 
				 import config
			
 
				 import tools
			
 
				 import ast
			
 
				-import re
			
 
				-import stop_word
			
 
				 import logging
			
 
				 import math
			
 
				-from multiprocessing import Process, Pool
			
 
				 
			
 
				 TITLE = "关键词倒排文件 统计"
			
 
				 
			
@@ -26,7 +19,7 @@ def handle(start_pos, end_pos):
 
				 
			
 
				     # 统计信息容器
			
 
				     reverse_statistics = {}
			
 
				-    
			
 
				+
			
 
				     with open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as fr, \
			
 
				         mmap.mmap(fr.fileno(), 0 , access=mmap.ACCESS_READ) as fmmap:
			
 
				         # 调整开始位置
			
@@ -37,23 +30,23 @@ def handle(start_pos, end_pos):
 
				             # 越界检测
			
 
				             if cur_pos >= end_pos:
			
 
				                 break
			
 
				-            
			
 
				+
			
 
				             line = fmmap.readline().decode(config.ENCODING_CHARSET)
			
 
				             index=line.index(",")
			
 
				             key = line[:index]
			
 
				             word_root = line[index+1:]
			
 
				             word_root = ast.literal_eval(word_root)
			
 
				             l = len(word_root)
			
 
				-            
			
 
				+
			
 
				             reverse_statistics[key]=l
			
 
				-            
			
 
				+
			
 
				     logging.info("进程：%d， 统计结束" % os.getpid())
			
 
				 
			
 
				     return {
			
 
				         "pid":os.getpid(),
			
 
				         "statistics":reverse_statistics
			
 
				     }
			
 
				-    
			
 
				+
			
 
				 
			
 
				 def main2():
			
 
				     # 日志信息配置
			
@@ -89,7 +82,7 @@ def main2():
 
				         # 记录位置信息
			
 
				         pos_list.append(pos)
			
 
				 
			
 
				-     
			
 
				+
			
 
				     # 使用进程池
			
 
				     pool = ProcessPoolExecutor(process_num)
			
 
				     # 生成任务
			
@@ -97,7 +90,7 @@ def main2():
 
				     for i in range(0, len(pos_list)-1):
			
 
				         pos = pos_list[i: i+2]
			
 
				         process_futures.append(pool.submit(handle, pos[0], pos[1]))
			
 
				-    
			
 
				+
			
 
				     # with open(config.KEY_REVERSE_STATISTICS_FILE, "w", encoding=config.ENCODING_CHARSET) as fw:
			
 
				     #     for future in as_completed(process_futures):
			
 
				     #         logging.info("部分子任务统计结束，保存至本地 - 开始")
			
@@ -105,7 +98,7 @@ def main2():
 
				     #             fw.write("%s,%s\n"%(key,value))
			
 
				     #         logging.info("部分子任务统计结束，保存至本地 - 结束")
			
 
				 
			
 
				-    
			
 
				+
			
 
				     results = []
			
 
				     for future in as_completed(process_futures):
			
 
				         result = future.result()
			
@@ -142,14 +135,14 @@ def main2():
 
				     # p = Process(target=handle, args=('测试进程', 1, 10))
			
 
				     # p.start()
			
 
				     # p.join()
			
 
				-    
			
 
				+
			
 
				     # tools.init_log()
			
 
				     # tools.log_start_msg(TITLE)
			
 
				 
			
 
				     # key_reverse_index = tools.load_obj(config.KEY_REVERSE_INDEX_CACHE)
			
 
				 
			
 
				     # tmp = [key for key in key_reverse_index.keys()]
			
 
				-    
			
 
				+
			
 
				     # l = len(tmp)
			
 
				     # print("总长：", l)
			
 
				     # internal = math.ceil(l / 4)
			
@@ -168,7 +161,7 @@ def main2():
 
				     #     pos = key_reverse_index[key[0]]
			
 
				     #     print(key, pos)
			
 
				 
			
 
				-    
			
 
				+
			
 
				     # reverse_statistics = {}
			
 
				     # logging.info("统计开始")
			
 
				     # with open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as fr, \
			
@@ -179,7 +172,7 @@ def main2():
 
				     #         word_root = line[index+1:]
			
 
				     #         word_root = ast.literal_eval(word_root)
			
 
				     #         l = len(word_root)
			
 
				-            
			
 
				+
			
 
				     #         reverse_statistics[key]=l
			
 
				 
			
 
				     # logging.info("统计结束，保存至本地")
			
@@ -193,7 +186,7 @@ def main2():
 
				 def main():
			
 
				     tools.init_log()
			
 
				     tools.log_start_msg(TITLE)
			
 
				-    
			
 
				+
			
 
				     reverse_statistics = {}
			
 
				     logging.info("统计开始")
			
 
				     with open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as fr, \
			
@@ -204,7 +197,7 @@ def main():
 
				             word_root = line[index+1:]
			
 
				             word_root = ast.literal_eval(word_root)
			
 
				             l = len(word_root)
			
 
				-            
			
 
				+
			
 
				             reverse_statistics[key]=l
			
 
				 
			
 
				     logging.info("统计结束，保存至本地")
			
@@ -215,4 +208,4 @@ def main():
 
				     tools.log_end_msg(TITLE)
			
 
				 
			
 
				 if __name__ == "__main__":
			
 
				-    main2()
			
 
				+    main2()
			
--- a/src/tmp/logging.conf
+++ b/src/tmp/logging.conf
--- a/src/tmp/merge.py
+++ b/src/tmp/merge.py
@@ -28,7 +28,7 @@ def merge_file_content():
 
				     dir_path : string
			
 
				         待读取的文件夹
			
 
				 
			
 
				-    dest_file : string 
			
 
				+    dest_file : string
			
 
				         合并后输出的文件
			
 
				 
			
 
				     exclude_file : list
			
@@ -56,7 +56,7 @@ def merge_file_content():
 
				 
			
 
				             # 重新编码文件名为正确形式
			
 
				             realname = filename.encode('cp437').decode('gbk')
			
 
				-                
			
 
				+
			
 
				             # 排除无效文件
			
 
				             if realname in config.MERGE_EXCLUDE_FILES:
			
 
				                 continue
			
@@ -73,15 +73,16 @@ def merge_file_content():
 
				                     repeat_set.add(split[0])
			
 
				                     # 记录次数
			
 
				                     total_count = total_count + 1
			
 
				-        
			
 
				+
			
 
				         tools.tip(total_num, i)
			
 
				-    
			
 
				-    logging.info("正在保存合并结果，文件位置：%s，排重前数据量：%d，排重后数据量：%d" % (config.MERGE_FILE, total_count, len(repeat_set)))
			
 
				+
			
 
				+    logging.info("正在保存合并结果，文件位置：%s，排重前数据量：%d，排重后数据量：%d" % (
			
 
				+      config.MERGE_FILE, total_count, len(repeat_set)))
			
 
				     with open(config.MERGE_FILE, "w", encoding="utf-8") as f:
			
 
				         for item in repeat_set:
			
 
				             f.write(item)
			
 
				             f.write("\n")
			
 
				-    
			
 
				+
			
 
				 
			
 
				 if __name__ == '__main__':
			
 
				 
			
@@ -94,4 +95,3 @@ if __name__ == '__main__':
 
				     merge_file_content()
			
 
				 
			
 
				     tools.log_end_msg(TITLE)
			
 
				-        
			
--- a/src/tmp/split.py
+++ b/src/tmp/split.py
--- a/src/tmp/statistics.py
+++ b/src/tmp/statistics.py
@@ -1,8 +1,5 @@
 
				 # -*- coding:utf-8 -*-
			
 
				 
			
 
				-from collections import namedtuple
			
 
				-from dataclasses import make_dataclass
			
 
				-import os
			
 
				 import tools
			
 
				 import config
			
 
				 import logging
			
@@ -47,9 +44,9 @@ def cost_statistics():
 
				             count = int(line[first_index+1:])
			
 
				             count_list.append(count)
			
 
				             total_count = total_count + count
			
 
				-        
			
 
				+
			
 
				         logging.info("总祠根数：%d, 涉及的总分词查找数：%d" % (len(count_list), total_count))
			
 
				-        
			
 
				+
			
 
				         tip("等于1", [val for val in count_list if val == 1])
			
 
				 
			
 
				         tip("大于1小于100", [val for val in count_list if val > 1 and val < 100])
			
@@ -61,7 +58,7 @@ def cost_statistics():
 
				         tip("大于等于300小于400", [val for val in count_list if val >= 300 and val < 400])
			
 
				 
			
 
				         tip("大于等于400小于500", [val for val in count_list if val >= 400 and val < 500])
			
 
				-        
			
 
				+
			
 
				         tip("大于等于500小于1000", [val for val in count_list if val >= 500 and val < 1000])
			
 
				 
			
 
				         tip("大于等于1000小于5000", [val for val in count_list if val >= 1000 and val < 5000])
			
@@ -100,13 +97,13 @@ def memory_statistics():
 
				     logging.info("查找结束位置")
			
 
				     with open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as freverse, \
			
 
				         mmap.mmap(freverse.fileno(), 0, access=mmap.ACCESS_READ) as fmmap:
			
 
				-        
			
 
				+
			
 
				         logging.info("开始构建缓存")
			
 
				         cache = {}
			
 
				         start = time.time()
			
 
				         while True:
			
 
				             cur_pos = fmmap.tell()
			
 
				-            
			
 
				+
			
 
				             if cur_pos > end_pos:
			
 
				                 break
			
 
				 
			
@@ -127,7 +124,7 @@ def memory_statistics():
 
				 
			
 
				         time.sleep(20)
			
 
				         logging.info('留20s进行内存观察')
			
 
				-        
			
 
				+
			
 
				 
			
 
				 def main():
			
 
				 
			
@@ -135,10 +132,10 @@ def main():
 
				     tools.log_start_msg(TASK_TITLE)
			
 
				 
			
 
				     memory_statistics()
			
 
				-        
			
 
				+
			
 
				 
			
 
				     tools.log_end_msg(TASK_TITLE)
			
 
				 
			
 
				 
			
 
				 if __name__ == "__main__":
			
 
				-    main()
			
 
				+    main()
			
--- a/src/tmp/tools.py
+++ b/src/tmp/tools.py
@@ -6,7 +6,6 @@ import os
 
				 import config
			
 
				 import logging.config
			
 
				 import pickle
			
 
				-import mmap
			
 
				 
			
 
				 TITLE = "工具类"
			
 
				 
			
@@ -17,7 +16,7 @@ def init_log():
 
				     日志初始化工具
			
 
				     """
			
 
				     # 读取日志配置文件内容
			
 
				-    logging.config.fileConfig('./logging.conf')
			
 
				+    logging.config.fileConfig('logging.conf')
			
 
				 
			
 
				     # 用一个没有在配置文件中定义的logger名称来创建一个日志器logger
			
 
				     return logging.getLogger()
			
@@ -45,7 +44,7 @@ def get_tip_internal(total_num):
 
				         internal = math.ceil(total_num * config.PRECENT_TIPS)
			
 
				         tip_internal_cache[total_num] = internal
			
 
				     return internal
			
 
				-    
			
 
				+
			
 
				 
			
 
				 def tip(total_num, cur_num, is_zero_base=True):
			
 
				     """
			
@@ -93,7 +92,7 @@ def tip_in_size(total_size, cur_pos):
 
				         }
			
 
				         # 放入缓存
			
 
				         tip_internal_cache[total_size] = tip_internal
			
 
				-    
			
 
				+
			
 
				     # 当前位置超过提示检查点则显示进度
			
 
				     if cur_pos >= tip_internal["check_point"]:
			
 
				 
			
@@ -112,7 +111,7 @@ def tip_in_size(total_size, cur_pos):
 
				                 check_point = total_size
			
 
				                 # 如果不手动中断会陷入循环
			
 
				                 break
			
 
				-        
			
 
				+
			
 
				         # 更新 提示检查点
			
 
				         tip_internal["check_point"] = check_point
			
 
				 
			
@@ -158,12 +157,12 @@ def load_stop_word():
 
				     stop_word_dict = {}
			
 
				     for item in stop_word:
			
 
				         stop_word_dict[item]=None
			
 
				-    
			
 
				+
			
 
				     logging.debug("把停用词缓存保存到本地")
			
 
				 
			
 
				     # 保存本地作为缓存
			
 
				     save_obj(config.STOP_WORD_CACHE, stop_word_dict)
			
 
				-    
			
 
				+
			
 
				     return stop_word_dict
			
 
				 
			
 
				 
			
@@ -185,7 +184,7 @@ def avg_split_task(total:int, split_internal:int):
 
				         if end_pos >= total:
			
 
				             end_pos = -1
			
 
				         tasks.append([start_pos,end_pos])
			
 
				-    
			
 
				+
			
 
				     return tasks
			
 
				 
			
 
				 if __name__ == "__main__":
			
--- a/src/utils.py
+++ b/src/utils.py
@@ -0,0 +1,57 @@
 
				+# -*- coding:utf-8 -*-
			
 
				+
			
 
				+import os
			
 
				+import pickle
			
 
				+
			
 
# Folder that holds the stop-word list files. The path is relative, so it is
# resolved against the current working directory of the running process.
STOP_WORD_DIR = "./conf/stopwords"

# Pickle cache of the stop-word dictionary (also a relative path).
STOP_WORD_CACHE = "../tmp/stop_word.pkl"
			
 
				+
			
 
				+def save_obj(path, obj):
			
 
				+    """
			
 
				+    保存对象至本地
			
 
				+    """
			
 
				+    with open(path, "wb") as f:
			
 
				+        pickle.dump(obj, f)
			
 
				+
			
 
				+
			
 
				+def load_obj(path):
			
 
				+    """
			
 
				+    加载对象
			
 
				+    """
			
 
				+    with open(path, "rb") as f:
			
 
				+        return pickle.load(f)
			
 
				+
			
 
				+
			
 
				+def load_stop_word():
			
 
				+    """
			
 
				+    加载停用词
			
 
				+    """
			
 
				+
			
 
				+    # 判断是否存在缓存
			
 
				+    if os.path.exists(STOP_WORD_CACHE):
			
 
				+        return load_obj(STOP_WORD_CACHE)
			
 
				+
			
 
				+    # 停用词容器
			
 
				+    stop_word = set()
			
 
				+
			
 
				+    # 构建停用词列表
			
 
				+    stop_word_files = os.listdir(STOP_WORD_DIR)
			
 
				+    for file in stop_word_files:
			
 
				+        stop_word_file = os.path.join(STOP_WORD_DIR, file)
			
 
				+        with open(stop_word_file, encoding="UTF-8") as f:
			
 
				+            for item in f:
			
 
				+                # 移除换行符
			
 
				+                stop_word.add(item.replace("\n", "").replace("\r", ""))
			
 
				+
			
 
				+    # 改成dict提升检索速度
			
 
				+    stop_word_dict = {}
			
 
				+    for item in stop_word:
			
 
				+        stop_word_dict[item] = None
			
 
				+
			
 
				+    # 保存本地作为缓存
			
 
				+    save_obj(STOP_WORD_CACHE, stop_word_dict)
			
 
				+
			
 
				+    return stop_word_dict