@@ -5,6 +5,12 @@ import utils
 import jieba
 
 import zipfile
 
+# File suffix: 长尾词.txt (per-source long-tail keyword list)
+FILE_SUFFIX_LONG_TAIL = "_长尾词.txt"
+
+# File suffix: 长尾词_合并.txt (merged long-tail keyword list)
+FILE_SUFFIX_LONG_TAIL_MERGE = "_长尾词_合并.txt"
+
 
 def extract_word_from_5118(file_path: str):
@@ -40,13 +46,41 @@ def extract_word_from_5118(file_path: str):
             word_container.add(split[0])
 
     output_file_name = real_name[0:real_name.index("--")]
-    output_file_path = os.path.join(file_path, output_file_name + "_长尾词.txt")
+    output_file_path = os.path.join(file_path, output_file_name + FILE_SUFFIX_LONG_TAIL)
     with open(output_file_path, "w", encoding="utf-8") as f:
         for item in word_container:
             f.write(item)
             f.write("\n")
 
 
+def merge_word(file_path: str):
+    """
+    Merge the long-tail keyword files (with deduplication)
+    :param file_path: path of the folder to process
+    :return: None
+    """
+    # Collect the long-tail keyword files
+    file_list = []
+    for file in os.listdir(file_path):
+        if file.endswith(FILE_SUFFIX_LONG_TAIL):
+            file_list.append(os.path.join(file_path, file))
+
+    # Set container for the deduplicated keywords
+    word_set = set()
+
+    # Read the data and deduplicate
+    for file in file_list:
+        with open(file, "r", encoding="utf-8") as f:
+            for word in f:
+                word_set.add(word.replace("\n", ""))
+
+    # Save the merged result
+    with open(os.path.join(file_path, str(len(file_list)) + FILE_SUFFIX_LONG_TAIL_MERGE), "w", encoding="utf-8") as f:
+        for item in word_set:
+            f.write(item)
+            f.write("\n")
+
+
 def word_split_statistics(file_path: str):
     """
     Word segmentation statistics
@@ -61,7 +95,7 @@ def word_split_statistics(file_path: str):
     stop_word_dict = utils.load_stop_word()
 
     for i, file in enumerate(file_list):
-        if not file.endswith("_长尾词.txt"):
+        if not file.endswith(FILE_SUFFIX_LONG_TAIL):
             continue
 
         # Container for the segmentation results
@@ -85,14 +119,17 @@ def word_split_statistics(file_path: str):
         # Sort in descending order of word frequency
         sorted_key_list = sorted(key_dict.items(), key=lambda x: x[1], reverse=True)
 
-        output_file_name = file[file.rindex("\\")+1:file.index("_长尾词.txt")]
+        output_file_name = file[file.rindex("\\") + 1:file.index(FILE_SUFFIX_LONG_TAIL)]
         output_file_path = os.path.join(file_path, output_file_name + "_长尾词_分词统计.csv")
         with open(output_file_path, "w", encoding="UTF-8") as f:
             for key, count in sorted_key_list:
                 f.write("%s,%d\n" % (key, count))
 
+
 if __name__ == "__main__":
-    file_path = "E:\Download\原始词库"
-    # file_path = "E:\Download\测试"
-    extract_word_from_5118(file_path)
+    file_path = "../data"
+    # file_path = "E:\Download\测试"
+    # extract_word_from_5118(file_path)
+    merge_word(file_path)
     # word_split_statistics(file_path)
+
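
For context, a minimal driver sketch showing how the new merge step composes with the existing extraction step. The module name word_extract is hypothetical (the patch does not show the file's name), and the folder is assumed to hold 5118 keyword exports:

# Hypothetical driver for the patched module; "word_extract" is an assumed
# module name and "../data" an assumed folder of 5118 exports.
import word_extract

folder = "../data"

# Step 1 (pre-existing): extract keywords from the 5118 exports
# into per-source "*_长尾词.txt" lists.
word_extract.extract_word_from_5118(folder)

# Step 2 (added by this patch): merge every "*_长尾词.txt" file into a single
# deduplicated "<N>_长尾词_合并.txt", where N is the number of merged files.
word_extract.merge_word(folder)

# Step 3 (pre-existing, optional): per-file jieba segmentation statistics.
# word_extract.word_split_statistics(folder)

Note that merge_word writes out a Python set, so the line order of the merged file is arbitrary between runs; iterating over sorted(word_set) instead would make the output deterministic.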