Improve the git configuration file; add long-tail keyword merging

ChenYL, 2 years ago
Commit 8143075eef
2 changed files with 44 additions and 6 deletions
  1. .gitignore (+1, -0)
  2. src/money.py (+43, -6)

.gitignore (+1, -0)

@@ -164,3 +164,4 @@ cython_debug/
 
 tmp/
 !src/tmp
+data/

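Worth noting: the newly ignored data/ directory lines up with the ../data working folder introduced in src/money.py below, so both the raw 5118 exports and the generated keyword files stay out of version control. A minimal sketch for checking the rule from Python, assuming a checkout that contains this .gitignore; the path data/demo.txt is a hypothetical example:

    # Ask git which ignore rule matches a path (data/demo.txt is hypothetical).
    import subprocess

    result = subprocess.run(
        ["git", "check-ignore", "-v", "data/demo.txt"],
        capture_output=True, text=True,
    )
    print(result.stdout or "not ignored")
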
src/money.py (+43, -6)

@@ -5,6 +5,12 @@ import utils
 import jieba
 import zipfile
 
+# File-name suffix for extracted long-tail keyword files
+FILE_SUFFIX_LONG_TAIL = "_长尾词.txt"
+
+# File-name suffix for the merged long-tail keyword file
+FILE_SUFFIX_LONG_TAIL_MERGE = "_长尾词_合并.txt"
+
 
 def extract_word_from_5118(file_path: str):
     """
@@ -40,13 +46,41 @@ def extract_word_from_5118(file_path: str):
                     word_container.add(split[0])
 
             output_file_name = real_name[0:real_name.index("--")]
-            output_file_path = os.path.join(file_path, output_file_name + "_长尾词.txt")
+            output_file_path = os.path.join(file_path, output_file_name + FILE_SUFFIX_LONG_TAIL)
             with open(output_file_path, "w", encoding="utf-8") as f:
                 for item in word_container:
                     f.write(item)
                     f.write("\n")
 
 
+def merge_word(file_path: str):
+    """
+    Merge the long-tail keyword files (with deduplication).
+    :param file_path: path of the folder to process
+    :return: None
+    """
+    # Collect every per-source keyword file in the folder
+    file_list = []
+    for file in os.listdir(file_path):
+        if file.endswith(FILE_SUFFIX_LONG_TAIL):
+            file_list.append(os.path.join(file_path, file))
+
+    # A set deduplicates keywords across all files
+    word_set = set()
+
+    # Read the data, stripping the trailing newline from each keyword
+    for file in file_list:
+        with open(file, "r", encoding="utf-8") as f:
+            for word in f:
+                word_set.add(word.rstrip("\n"))
+
+    # Save the merged result; the file name records how many files were merged
+    with open(os.path.join(file_path, str(len(file_list)) + FILE_SUFFIX_LONG_TAIL_MERGE), "w", encoding="utf-8") as f:
+        for item in word_set:
+            f.write(item)
+            f.write("\n")
+
+
 def word_split_statistics(file_path: str):
     """
    Word-segmentation statistics
@@ -61,7 +95,7 @@ def word_split_statistics(file_path: str):
     stop_word_dict = utils.load_stop_word()
 
     for i, file in enumerate(file_list):
-        if not file.endswith("_长尾词.txt"):
+        if not file.endswith(FILE_SUFFIX_LONG_TAIL):
             continue
 
        # Container for the word-segmentation results
@@ -85,14 +119,17 @@ def word_split_statistics(file_path: str):
        # Sort in descending order by word frequency
         sorted_key_list = sorted(key_dict.items(), key=lambda x: x[1], reverse=True)
 
-            output_file_name = file[file.rindex("\\")+1:file.index("_长尾词.txt")]
+            output_file_name = os.path.basename(file)[:-len(FILE_SUFFIX_LONG_TAIL)]
         output_file_path = os.path.join(file_path, output_file_name + "_长尾词_分词统计.csv")
         with open(output_file_path, "w", encoding="UTF-8") as f:
             for key, count in sorted_key_list:
                 f.write("%s,%d\n" % (key, count))
 
+
 if __name__ == "__main__":
-    file_path = "E:\Download\原始词库"
-    # file_path = "E:\Download\测试"
-    extract_word_from_5118(file_path)
+    file_path = "../data"
+    # file_path = "E:\Download\测试"
+    # extract_word_from_5118(file_path)
+    merge_word(file_path)
     # word_split_statistics(file_path)
+
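For reference, a minimal sketch of how the three steps in src/money.py chain together after this commit, following the __main__ block above; running from inside src/ with the 5118 exports already placed in ../data is an assumption, not something the diff guarantees:

    import money

    data_dir = "../data"
    money.extract_word_from_5118(data_dir)  # 5118 exports -> one *_长尾词.txt per source
    money.merge_word(data_dir)              # deduplicated merge -> N_长尾词_合并.txt
    money.word_split_statistics(data_dir)   # jieba counts -> *_长尾词_分词统计.csv

The N in the merged file name is the number of *_长尾词.txt files found, which merge_word encodes via str(len(file_list)).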