Improve the git configuration file; add long-tail keyword merging

ChenYL, 2 years ago
Commit 8143075eef
2 changed files with 44 additions and 6 deletions
  1. .gitignore (+1, -0)
  2. src/money.py (+43, -6)

.gitignore (+1, -0)

@@ -164,3 +164,4 @@ cython_debug/
 
 tmp/
 !src/tmp
+data/

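Worth noting: the newly ignored data/ directory lines up with the ../data working folder introduced in src/money.py below, so both the raw 5118 exports and the generated keyword files stay out of version control. A minimal sketch for checking the rule from Python, assuming a checkout that contains this .gitignore; the path data/demo.txt is a hypothetical example:

    # Ask git which ignore rule matches a path (data/demo.txt is hypothetical).
    import subprocess

    result = subprocess.run(
        ["git", "check-ignore", "-v", "data/demo.txt"],
        capture_output=True, text=True,
    )
    print(result.stdout or "not ignored")
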
src/money.py (+43, -6)

@@ -5,6 +5,12 @@ import utils
 import jieba
 import zipfile
 
+# File-name suffix for extracted long-tail keyword files
+FILE_SUFFIX_LONG_TAIL = "_长尾词.txt"
+
+# File-name suffix for the merged long-tail keyword file
+FILE_SUFFIX_LONG_TAIL_MERGE = "_长尾词_合并.txt"
+
 
 def extract_word_from_5118(file_path: str):
     """
@@ -40,13 +46,41 @@ def extract_word_from_5118(file_path: str):
                     word_container.add(split[0])
 
             output_file_name = real_name[0:real_name.index("--")]
-            output_file_path = os.path.join(file_path, output_file_name + "_长尾词.txt")
+            output_file_path = os.path.join(file_path, output_file_name + FILE_SUFFIX_LONG_TAIL)
             with open(output_file_path, "w", encoding="utf-8") as f:
                 for item in word_container:
                     f.write(item)
                     f.write("\n")
 
 
+def merge_word(file_path: str):
+    """
+    Merge the long-tail keyword files (with deduplication).
+    :param file_path: path of the folder to process
+    :return: None
+    """
+    # Collect every per-source keyword file in the folder
+    file_list = []
+    for file in os.listdir(file_path):
+        if file.endswith(FILE_SUFFIX_LONG_TAIL):
+            file_list.append(os.path.join(file_path, file))
+
+    # A set deduplicates keywords across all files
+    word_set = set()
+
+    # Read the data, stripping the trailing newline from each keyword
+    for file in file_list:
+        with open(file, "r", encoding="utf-8") as f:
+            for word in f:
+                word_set.add(word.rstrip("\n"))
+
+    # Save the merged result; the file name records how many files were merged
+    with open(os.path.join(file_path, str(len(file_list)) + FILE_SUFFIX_LONG_TAIL_MERGE), "w", encoding="utf-8") as f:
+        for item in word_set:
+            f.write(item)
+            f.write("\n")
+
+
 def word_split_statistics(file_path: str):
     """
    Word-segmentation statistics
@@ -61,7 +95,7 @@ def word_split_statistics(file_path: str):
     stop_word_dict = utils.load_stop_word()
 
     for i, file in enumerate(file_list):
-        if not file.endswith("_长尾词.txt"):
+        if not file.endswith(FILE_SUFFIX_LONG_TAIL):
             continue
 
        # Container for the word-segmentation results
@@ -85,14 +119,17 @@ def word_split_statistics(file_path: str):
        # Sort in descending order by word frequency
         sorted_key_list = sorted(key_dict.items(), key=lambda x: x[1], reverse=True)
 
-            output_file_name = file[file.rindex("\\")+1:file.index("_长尾词.txt")]
+            output_file_name = os.path.basename(file)[:-len(FILE_SUFFIX_LONG_TAIL)]
         output_file_path = os.path.join(file_path, output_file_name + "_长尾词_分词统计.csv")
         with open(output_file_path, "w", encoding="UTF-8") as f:
             for key, count in sorted_key_list:
                 f.write("%s,%d\n" % (key, count))
 
+
 if __name__ == "__main__":
-    file_path = "E:\Download\原始词库"
-    # file_path = "E:\Download\测试"
-    extract_word_from_5118(file_path)
+    file_path = "../data"
+    # file_path = "E:\Download\测试"
+    # extract_word_from_5118(file_path)
+    merge_word(file_path)
     # word_split_statistics(file_path)
+
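For reference, a minimal sketch of how the three steps in src/money.py chain together after this commit, following the __main__ block above; running from inside src/ with the 5118 exports already placed in ../data is an assumption, not something the diff guarantees:

    import money

    data_dir = "../data"
    money.extract_word_from_5118(data_dir)  # 5118 exports -> one *_长尾词.txt per source
    money.merge_word(data_dir)              # deduplicated merge -> N_长尾词_合并.txt
    money.word_split_statistics(data_dir)   # jieba counts -> *_长尾词_分词统计.csv

The N in the merged file name is the number of *_长尾词.txt files found, which merge_word encodes via str(len(file_list)).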