2 tahun lalu · 68e0564f5e
--- a/README.md
+++ b/README.md
@@ -1,12 +1,21 @@
 
				 # 开发记录
			
 
				 
			
 
				 ## 待办列表
			
 
				-* 多进程多线程使用tqdm显示进度
			
 
				+* 主进程进度不显示
			
 
				+* 子进程显示不合理
			
 
				+* 聚合速度需要进一步优化
			
 
				+* 修改缓存建立方式（目前：1.5秒/个，期望：耗时降到目前的1/10以下）
			
 
				+* 修改子进程任务获取方式
			
 
				 
			
 
				 ## 开发进度
			
 
				+* 2024-01-18
			
 
				+  - 移除bitmap依赖包
			
 
				+  - 导出conda环境文件
			
 
				+  - 计算结果文件改为归档而不是删除
			
 
				 * 2024-01-17
			
 
				   - 增加多进程初始化
			
 
				   - 长尾词聚合增加使用多线程
			
 
				+  - 多进程多线程使用tqdm显示进度
			
 
				 * 2024-01-16
			
 
				   - 增加使用redis，提高性能 
			
 
				 * 2023-12-15
			
@@ -39,3 +48,10 @@
 
				   * （这个结论没啥用处，因为多进程一般是完成不同的任务）仅就顺序读取同一个文件而言，with open 与 mmap 都是单进程读取一次比多进程读取一次更快，而且 with open 的差距更明显
			
 
				   * 多进程分段读取时，mmap 比 with open 快很多，with open 非常慢
			
 
				 
			
 
				+## conda命令
			
 
				+conda环境重命名（复制一个新的环境，然后把旧的环境删除）
			
 
				+```commandline
			
 
				+conda create -n tf --clone rcnn #把环境 rcnn 重命名成 tf
			
 
				+conda remove -n rcnn --all 
			
 
				+```
			
 
				+
			
--- a/src/agg.py
+++ b/src/agg.py
@@ -2,7 +2,9 @@
 
				 import math
			
 
				 import os
			
 
				 import re
			
 
				+import shutil
			
 
				 import threading
			
 
				+import time
			
 
				 from concurrent.futures import ProcessPoolExecutor, as_completed, ThreadPoolExecutor
			
 
				 
			
 
				 import jieba
			
@@ -21,6 +23,9 @@ FILE_LONG_TAIL_MERGE_SPLIT = "长尾词_合并_分词.txt"
 
				 # 文件：长尾词_合并_聚合.txt
			
 
				 FILE_LONG_TAIL_MERGE_AGG = "长尾词_合并_聚合.txt"
			
 
				 
			
 
				+# 文件夹：历史聚合数据归档文件夹
			
 
				+DIR_AGG_FILE_ARCHIVE = "长尾词_聚合_归档_%s"
			
 
				+
			
 
				 # 文件：长尾词_合并_分词倒排索引.txt
			
 
				 FILE_LONG_TAIL_MERGE_REVERSE_INDEX = "长尾词_合并_倒排索引.txt"
			
 
				 
			
@@ -95,11 +100,15 @@ def agg_word(file_path: str):
 
				         if os.path.exists(input_file) and not os.path.isfile(input_file):
			
 
				             raise Exception("文件不存在！文件路径：" + input_file)
			
 
				 
			
 
				-    # 删除历史数据文件
			
 
				-    for file in os.listdir(file_path):
			
 
				-        if agg_file_pattern.match(file):
			
 
				-            os.remove(os.path.join(file_path, file))
			
 
				+    # 归档历史数据文件
			
 
				+    history_agg_file_list = [file for file in os.listdir(file_path) if agg_file_pattern.match(file)]
			
 
				+    if len(history_agg_file_list) > 0:
			
 
				+        archive_path = os.path.join(file_path, DIR_AGG_FILE_ARCHIVE % time.strftime('%Y%m%d%H%M%S'))
			
 
				+        os.makedirs(archive_path)
			
 
				+        for history_agg_file in history_agg_file_list:
			
 
				+            shutil.move(os.path.join(file_path, history_agg_file), archive_path)
			
 
				 
			
 
				+    return
			
 
				     # 缓存关键词位置
			
 
				     word_file = os.path.join(file_path, FILE_LONG_TAIL_MERGE)
			
 
				     word_dict = {}