|
|
@@ -148,6 +148,17 @@ def merge_word(data_path: str, txt_files: list):
|
|
|
# 长尾词集合容器
|
|
|
word_set = set()
|
|
|
|
|
|
+ # 读取数据目录下是否还有其他长尾词文件
|
|
|
+ files = os.listdir(data_path)
|
|
|
+ if files:
|
|
|
+ for file in files:
|
|
|
+ if file.endswith(WORD_FILE_SUFFIX):
|
|
|
+ txt_files.append(os.path.join(data_path, file))
|
|
|
+
|
|
|
+ # 如果没有待处理的文件列表,返回
|
|
|
+ if not txt_files:
|
|
|
+ return False
|
|
|
+
|
|
|
# 读取数据并排重
|
|
|
for i, file in enumerate(txt_files):
|
|
|
with open(file, "r", encoding="utf-8") as f:
|
|
|
@@ -182,7 +193,7 @@ def word_split_and_reverse_index(data_path: str):
|
|
|
|
|
|
if total_line_num == 0:
|
|
|
print("没有待处理的数据,文本量为0")
|
|
|
- return True
|
|
|
+ return False
|
|
|
|
|
|
# 分割任务数量
|
|
|
task_list = utils.avg_split_task(total_line_num, math.ceil(total_line_num / os.cpu_count()))
|
|
|
@@ -324,6 +335,7 @@ def sort_file_content(data_path: str):
|
|
|
if not tmp_result:
|
|
|
continue
|
|
|
else:
|
|
|
+ tmp_result = sorted(tmp_result, key=lambda x: len(x), reverse=True)
|
|
|
result.append((count, tmp_result))
|
|
|
tmp_result = []
|
|
|
count = 0
|