Browse Source

feat:增加聚合文件再排序

ChenYL 2 years ago
parent
commit
9d707324a5
1 changed files with 40 additions and 1 deletions
  1. 40 1
      src/money.py

+ 40 - 1
src/money.py

@@ -127,17 +127,56 @@ def word_split_statistics(file_path: str):
                 f.write("%s,%d\n" % (key, count))
 
 
+def sort_file_content(file_path: str):
+    """
+    对聚合后的文件内容进行排序重写
+    :param file_path:
+    :return:
+    """
+
+    target_path = os.path.join(file_path, "长尾词_合并_聚合.txt")
+    if os.path.exists(target_path) and not os.path.isfile(target_path):
+        print("文件不存在! " + target_path)
+        return
+
+    result = []
+    tmp_result = []
+    count = 0;
+    with (open(target_path, "r", encoding="UTF-8") as fr,
+        open(target_path.replace(".txt", "排序.txt"), "w", encoding="UTF-8") as fw):
+        for line in fr.readlines():
+            if line.startswith("\n"):
+                if not tmp_result:
+                    continue
+                else:
+                    result.append((count, tmp_result))
+                    tmp_result = []
+                    count = 0
+            else:
+                count = count + 1
+                tmp_result.append(line)
+
+        result = sorted(result, key=lambda x: x[0], reverse=True)
+
+        for i, tmp_l in result:
+            for l in tmp_l:
+                fw.write(l)
+            fw.write("\n")
+
+
+
 if __name__ == "__main__":
     print("开始时间" + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
     # filePath = "../data"
     filePath = "../data/test"
     # extract_word_from_5118(filePath)
     # merge_word(filePath)
-    prepare_word_split_and_reverse_index(filePath)
+    # prepare_word_split_and_reverse_index(filePath)
     # agg_word(filePath)
     # word_split_statistics(file_path)
     # tasks = utils.avg_split_task(100, 12, 1)
     # The cosine value computed for the two equals: 0.8
     # val = utils.cal_cos_sim("QQ邮箱格式怎么写", ["QQ", "邮箱", "格式", "怎么", "写"], "QQ邮箱格式如何写",
     #                         ["QQ", "邮箱", "格式", "如何", "写"])
+    sort_file_content(filePath)
     print("结束时间" + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))