2 سال پیش · 9d707324a5
--- a/src/money.py
+++ b/src/money.py
@@ -127,17 +127,56 @@ def word_split_statistics(file_path: str):
 
															                 f.write("%s,%d\n" % (key, count))
														
 
															+def sort_file_content(file_path: str):
														
 
															+    """
														
 
															+    对聚合后的文件内容进行排序重写
														
 
															+    :param file_path:
														
 
															+    :return:
														
 
															+    """
														
 
															+
														
 
															+    target_path = os.path.join(file_path, "长尾词_合并_聚合.txt")
														
 
															+    if os.path.exists(target_path) and not os.path.isfile(target_path):
														
 
															+        print("文件不存在！ " + target_path)
														
 
															+        return
														
 
															+
														
 
															+    result = []
														
 
															+    tmp_result = []
														
 
															+    count = 0;
														
 
															+    with (open(target_path, "r", encoding="UTF-8") as fr,
														
 
															+        open(target_path.replace(".txt", "排序.txt"), "w", encoding="UTF-8") as fw):
														
 
															+        for line in fr.readlines():
														
 
															+            if line.startswith("\n"):
														
 
															+                if not tmp_result:
														
 
															+                    continue
														
 
															+                else:
														
 
															+                    result.append((count, tmp_result))
														
 
															+                    tmp_result = []
														
 
															+                    count = 0
														
 
															+            else:
														
 
															+                count = count + 1
														
 
															+                tmp_result.append(line)
														
 
															+
														
 
															+        result = sorted(result, key=lambda x: x[0], reverse=True)
														
 
															+
														
 
															+        for i, tmp_l in result:
														
 
															+            for l in tmp_l:
														
 
															+                fw.write(l)
														
 
															+            fw.write("\n")
														
 
															+
														
 
															+
														
 
															+
														
 
															 if __name__ == "__main__":
														
 
															     print("开始时间" + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
														
 
															     # filePath = "../data"
														
 
															     filePath = "../data/test"
														
 
															     # extract_word_from_5118(filePath)
														
 
															     # merge_word(filePath)
														
 
															-    prepare_word_split_and_reverse_index(filePath)
														
 
															+    # prepare_word_split_and_reverse_index(filePath)
														
 
															     # agg_word(filePath)
														
 
															     # word_split_statistics(file_path)
														
 
															     # tasks = utils.avg_split_task(100, 12, 1)
														
 
															     # 两者计算余弦值等于：0.8
														
 
															     # val = utils.cal_cos_sim("QQ邮箱格式怎么写", ["QQ", "邮箱", "格式", "怎么", "写"], "QQ邮箱格式如何写",
														
 
															     #                         ["QQ", "邮箱", "格式", "如何", "写"])
														
 
															+    sort_file_content(filePath)
														
 
															     print("结束时间" + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))