|
@@ -127,17 +127,56 @@ def word_split_statistics(file_path: str):
|
|
|
f.write("%s,%d\n" % (key, count))
|
|
f.write("%s,%d\n" % (key, count))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def sort_file_content(file_path: str):
    """Rewrite the aggregated keyword file with its groups ordered by size.

    Reads ``长尾词_合并_聚合.txt`` inside ``file_path``. Every run of
    consecutive non-blank lines is treated as one group; blank lines separate
    groups. The groups are written to ``长尾词_合并_聚合排序.txt`` in the same
    directory, largest group first. Groups with the same number of lines keep
    their original relative order, and every group is followed by one blank
    line.

    If the input file is missing (or is not a regular file) a message is
    printed and nothing is written.

    :param file_path: directory that contains the aggregated file
    :return: None
    """
    target_path = os.path.join(file_path, "长尾词_合并_聚合.txt")
    # Bail out for a missing path as well as for a non-file path; checking
    # only "exists but is not a file" would let a missing file reach open()
    # and raise FileNotFoundError.
    if not os.path.isfile(target_path):
        print("文件不存在! " + target_path)
        return

    groups = []
    current_group = []
    with open(target_path, "r", encoding="UTF-8") as fr:
        for line in fr:
            if line == "\n":
                # A blank line closes the group collected so far; repeated
                # blank lines are ignored.
                if current_group:
                    groups.append(current_group)
                    current_group = []
                continue
            # Only the final line of a file can lack its newline; add it so
            # the group separator written later stays on its own line.
            if not line.endswith("\n"):
                line += "\n"
            current_group.append(line)
    # The file may end without a trailing blank line; keep the last group.
    if current_group:
        groups.append(current_group)

    # list.sort is stable, so equally sized groups keep their input order.
    groups.sort(key=len, reverse=True)

    # Open the output only after reading succeeded, so a failed read never
    # leaves a truncated result file behind.
    sorted_path = os.path.join(file_path, "长尾词_合并_聚合排序.txt")
    with open(sorted_path, "w", encoding="UTF-8") as fw:
        for group in groups:
            fw.writelines(group)
            fw.write("\n")
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Script entry point: print a start banner, run the currently enabled
    # pipeline step, then print an end banner.
    print(f"开始时间{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}")

    # Directory the pipeline steps operate on (the full data set is "../data").
    # filePath = "../data"
    filePath = "../data/test"

    # Earlier pipeline steps, currently disabled:
    # extract_word_from_5118(filePath)
    # merge_word(filePath)
    # prepare_word_split_and_reverse_index(filePath)
    # agg_word(filePath)
    # word_split_statistics(file_path)
    # tasks = utils.avg_split_task(100, 12, 1)

    # Sample cosine-similarity call; the two phrases below score 0.8:
    # val = utils.cal_cos_sim("QQ邮箱格式怎么写", ["QQ", "邮箱", "格式", "怎么", "写"], "QQ邮箱格式如何写",
    #                         ["QQ", "邮箱", "格式", "如何", "写"])

    # Enabled step: sort the aggregated file's groups and write the result.
    sort_file_content(filePath)

    print(f"结束时间{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}")
|