|
|
@@ -2,7 +2,9 @@
|
|
|
import math
|
|
|
import os
|
|
|
import re
|
|
|
+import shutil
|
|
|
import threading
|
|
|
+import time
|
|
|
from concurrent.futures import ProcessPoolExecutor, as_completed, ThreadPoolExecutor
|
|
|
|
|
|
import jieba
|
|
|
@@ -21,6 +23,9 @@ FILE_LONG_TAIL_MERGE_SPLIT = "长尾词_合并_分词.txt"
|
|
|
# File: 长尾词_合并_聚合.txt (long-tail keywords, merged, aggregated)
|
|
|
FILE_LONG_TAIL_MERGE_AGG = "长尾词_合并_聚合.txt"
|
|
|
|
|
|
+# Directory: archive folder for historical aggregation data (%s is replaced with a timestamp)
|
|
|
+DIR_AGG_FILE_ARCHIVE = "长尾词_聚合_归档_%s"
|
|
|
+
|
|
|
# File: 长尾词_合并_倒排索引.txt (inverted index of the segmented words for the merged long-tail keywords)
|
|
|
FILE_LONG_TAIL_MERGE_REVERSE_INDEX = "长尾词_合并_倒排索引.txt"
|
|
|
|
|
|
@@ -95,11 +100,15 @@ def agg_word(file_path: str):
|
|
|
if os.path.exists(input_file) and not os.path.isfile(input_file):
|
|
|
raise Exception("文件不存在!文件路径:" + input_file)
|
|
|
|
|
|
- # 删除历史数据文件
|
|
|
- for file in os.listdir(file_path):
|
|
|
- if agg_file_pattern.match(file):
|
|
|
- os.remove(os.path.join(file_path, file))
|
|
|
+ # 归档历史数据文件
|
|
|
+ history_agg_file_list = [file for file in os.listdir(file_path) if agg_file_pattern.match(file)]
|
|
|
+ if len(history_agg_file_list) > 0:
|
|
|
+ archive_path = os.path.join(file_path, DIR_AGG_FILE_ARCHIVE % time.strftime('%Y%m%d%H%M%S'))
|
|
|
+ os.makedirs(archive_path)
|
|
|
+ for history_agg_file in history_agg_file_list:
|
|
|
+ shutil.move(os.path.join(file_path, history_agg_file), archive_path)
|
|
|
|
|
|
+ return
|
|
|
# 缓存关键词位置
|
|
|
word_file = os.path.join(file_path, FILE_LONG_TAIL_MERGE)
|
|
|
word_dict = {}
|