cut.py 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103
  1. # -*- coding:utf-8 -*-
  2. import config
  3. import os
  4. import tools
  5. import jieba
  6. import logging
  7. import logging.config
  8. from stop_word import load_stop_word
  9. # 待处理的数据文件
  10. INPUT_FILE = "E:\Download\怎么长尾词_1655561719.csv"
  11. # 处理输出文件
  12. OUTPUT_FILE = "E:\Download\长尾关键词\怎么长尾词_分词统计.csv"
  13. def cut_word_and_statistics(data):
  14. """
  15. 分词并统计词频
  16. """
  17. logging.info("开始执行分词操作并进行词频统计")
  18. # 分词结果容器
  19. key_dict = {}
  20. # 停用词
  21. stop_word = load_stop_word()
  22. # 待处理数据总数量
  23. total_num = len(data)
  24. logging.info("共需处理 %d 条数据" % total_num)
  25. for i, item in enumerate(data):
  26. # 只需要第一列的数据
  27. longTailKey = item.split(",")[0]
  28. # 移除换行符
  29. longTailKey = longTailKey.replace("\n", "")
  30. # 分词
  31. cutWord = jieba.cut_for_search(longTailKey)
  32. # 统计
  33. for word in cutWord:
  34. # 过滤停用词
  35. if word in stop_word:
  36. continue
  37. if word in key_dict:
  38. key_dict[word] = key_dict[word] + 1
  39. else:
  40. key_dict[word] = 1
  41. # 进度提示
  42. tools.tip(total_num, i)
  43. # 根据词频倒序排列
  44. logging.info("根据词频进行倒序排列")
  45. sorted_key_list = sorted(key_dict.items(), key=lambda x: x[1], reverse=True)
  46. logging.info("分词操作并进行词频统计 结束")
  47. return sorted_key_list
  48. def main(orig_file, dest_file):
  49. if not os.path.exists(orig_file):
  50. logging.warning("待处理的数据文件不存在:%s" % orig_file)
  51. return
  52. # 读取数据
  53. logging.info("正在读取待处理的数据文件:%s" % orig_file)
  54. lines = None
  55. with open(orig_file, "r", encoding=config.ENCODING_CHARSET) as f:
  56. lines = f.readlines()
  57. # 执行分词和词频统计(跳过前两行)
  58. word_root_list = cut_word_and_statistics(lines[2:])
  59. # 导出数据
  60. logging.info("正在导出分词数据,位置:%s" % dest_file)
  61. with open(dest_file, "w", encoding=config.ENCODING_CHARSET) as f:
  62. for key, count in word_root_list:
  63. f.write("%s,%d\n" % (key, count))
  64. if __name__ == '__main__':
  65. TITLE = "分词处理"
  66. # 日志初始化
  67. tools.init_log()
  68. tools.log_start_msg(TITLE)
  69. main()
  70. tools.log_end_msg(TITLE)