cut.py 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101
  1. # -*- coding:utf-8 -*-
  2. import config
  3. import os
  4. import tools
  5. import jieba
  6. import logging
  7. import logging.config
  8. # 待处理的数据文件
  9. INPUT_FILE = "E:\Download\怎么长尾词_1655561719.csv"
  10. # 处理输出文件
  11. OUTPUT_FILE = "E:\Download\长尾关键词\怎么长尾词_分词统计.csv"
  12. def cut_word_and_statistics(data):
  13. """
  14. 分词并统计词频
  15. """
  16. logging.info("开始执行分词操作并进行词频统计")
  17. # 分词结果容器
  18. key_dict = {}
  19. # 停用词
  20. stop_word = tools.load_stop_word()
  21. # 待处理数据总数量
  22. total_num = len(data)
  23. logging.info("共需处理 %d 条数据" % total_num)
  24. for i, item in enumerate(data):
  25. # 只需要第一列的数据
  26. longTailKey = item.split(",")[0]
  27. # 移除换行符
  28. longTailKey = longTailKey.replace("\n", "")
  29. # 分词
  30. cutWord = jieba.cut_for_search(longTailKey)
  31. # 统计
  32. for word in cutWord:
  33. # 过滤停用词
  34. if word in stop_word:
  35. continue
  36. if word in key_dict:
  37. key_dict[word] = key_dict[word] + 1
  38. else:
  39. key_dict[word] = 1
  40. # 进度提示
  41. tools.tip(total_num, i)
  42. # 根据词频倒序排列
  43. logging.info("根据词频进行倒序排列")
  44. sorted_key_list = sorted(key_dict.items(), key=lambda x: x[1], reverse=True)
  45. logging.info("分词操作并进行词频统计 结束")
  46. return sorted_key_list
  47. def main(orig_file, dest_file):
  48. if not os.path.exists(orig_file):
  49. logging.warning("待处理的数据文件不存在:%s" % orig_file)
  50. return
  51. # 读取数据
  52. logging.info("正在读取待处理的数据文件:%s" % orig_file)
  53. lines = None
  54. with open(orig_file, "r", encoding=config.ENCODING_CHARSET) as f:
  55. lines = f.readlines()
  56. # 执行分词和词频统计(跳过前两行)
  57. word_root_list = cut_word_and_statistics(lines[2:])
  58. # 导出数据
  59. logging.info("正在导出分词数据,位置:%s" % dest_file)
  60. with open(dest_file, "w", encoding=config.ENCODING_CHARSET) as f:
  61. for key, count in word_root_list:
  62. f.write("%s,%d\n" % (key, count))
  63. if __name__ == '__main__':
  64. TITLE = "分词处理"
  65. # 日志初始化
  66. tools.init_log()
  67. tools.log_start_msg(TITLE)
  68. main()
  69. tools.log_end_msg(TITLE)