money.py 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143
  1. # -*- coding:utf-8 -*-
  2. import os
  3. import time
  4. import zipfile
  5. import jieba
  6. import utils
  7. from agg import prepare_word_split_and_reverse_index
  8. from constant import FILE_LONG_TAIL_MERGE
  9. # 文件后缀:长尾词.txt
  10. FILE_SUFFIX_LONG_TAIL = "_长尾词.txt"
  11. def extract_word_from_5118(file_path: str):
  12. """
  13. 从5118关键词压缩文件中提取数据
  14. :param file_path: 待处理文件夹路径
  15. :return: None
  16. """
  17. file_list = []
  18. for file in os.listdir(file_path):
  19. file_list.append(os.path.join(file_path, file))
  20. for i, file in enumerate(file_list):
  21. zfile = zipfile.ZipFile(file)
  22. filenames = zfile.namelist()
  23. for filename in filenames:
  24. # 重新编码文件名为正确形式
  25. real_name = filename.encode('cp437').decode('gbk')
  26. # 排除无效文件
  27. if real_name in ['打开乱码如何处理?.txt']:
  28. continue
  29. # 关键词存放容器
  30. word_container = set()
  31. # 读取压缩文件中的文件
  32. with zfile.open(filename) as file_content:
  33. lines = file_content.readlines()
  34. # 跳过开头两行
  35. for line in lines[2:]:
  36. split = line.decode("gbk").split(",")
  37. # 只需要第一列的数据
  38. word_container.add(split[0])
  39. output_file_name = real_name[0:real_name.index("--")]
  40. output_file_path = os.path.join(file_path, output_file_name + FILE_SUFFIX_LONG_TAIL)
  41. with open(output_file_path, "w", encoding="utf-8") as f:
  42. for item in word_container:
  43. f.write(item)
  44. f.write("\n")
  45. def merge_word(file_path: str):
  46. """
  47. 合并长尾词(带去重)
  48. :param file_path: 待处理文件夹路径
  49. :return: None
  50. """
  51. # 获取文件列表
  52. file_list = []
  53. for file in os.listdir(file_path):
  54. if file.endswith(FILE_SUFFIX_LONG_TAIL):
  55. file_list.append(os.path.join(file_path, file))
  56. # 长尾词集合容器
  57. word_set = set()
  58. # 读取数据并排重
  59. for i, file in enumerate(file_list):
  60. with open(file, "r", encoding="utf-8") as f:
  61. for word in f:
  62. word_set.add(word.replace("\n", ""))
  63. # 保存合并结果
  64. with open(os.path.join(file_path, FILE_LONG_TAIL_MERGE), "w", encoding="utf-8") as f:
  65. for item in word_set:
  66. f.write(item)
  67. f.write("\n")
  68. def word_split_statistics(file_path: str):
  69. """
  70. 分词统计
  71. :param file_path: 待处理文件夹路径
  72. :return: None
  73. """
  74. file_list = []
  75. for file in os.listdir(file_path):
  76. file_list.append(os.path.join(file_path, file))
  77. stop_word_dict = utils.load_stop_word()
  78. for i, file in enumerate(file_list):
  79. if not file.endswith(FILE_SUFFIX_LONG_TAIL):
  80. continue
  81. # 分词结果容器
  82. key_dict = {}
  83. with open(file, "r", encoding="utf-8") as f:
  84. for tmp_word in f:
  85. # 分词
  86. word_list = jieba.cut_for_search(tmp_word.replace("\n", ""))
  87. # 统计
  88. for word in word_list:
  89. # 过滤停用词
  90. if word in stop_word_dict:
  91. continue
  92. if word in key_dict:
  93. key_dict[word] = key_dict[word] + 1
  94. else:
  95. key_dict[word] = 1
  96. # 根据词频进行倒序排列
  97. sorted_key_list = sorted(key_dict.items(), key=lambda x: x[1], reverse=True)
  98. output_file_name = file[file.rindex("\\") + 1:file.index(FILE_SUFFIX_LONG_TAIL)]
  99. output_file_path = os.path.join(file_path, output_file_name + "_长尾词_分词统计.csv")
  100. with open(output_file_path, "w", encoding="UTF-8") as f:
  101. for key, count in sorted_key_list:
  102. f.write("%s,%d\n" % (key, count))
  103. if __name__ == "__main__":
  104. print("开始时间" + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
  105. # filePath = "../data"
  106. filePath = "../data/test"
  107. # extract_word_from_5118(filePath)
  108. # merge_word(filePath)
  109. prepare_word_split_and_reverse_index(filePath)
  110. # agg_word(filePath)
  111. # word_split_statistics(file_path)
  112. # tasks = utils.avg_split_task(100, 12, 1)
  113. # 两者计算余弦值等于:0.8
  114. # val = utils.cal_cos_sim("QQ邮箱格式怎么写", ["QQ", "邮箱", "格式", "怎么", "写"], "QQ邮箱格式如何写",
  115. # ["QQ", "邮箱", "格式", "如何", "写"])
  116. print("结束时间" + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))