cut.py 2.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. # -*- coding:utf-8 -*-
  2. import config
  3. import os
  4. import tools
  5. import jieba
  6. import logging
  7. import logging.config
  8. from stop_word import load_stop_word
  9. TITLE = "分词处理"
  10. # 待处理的数据文件
  11. DATA_FILE = "E:\Download\怎么长尾词_1655561719.csv"
  12. def cut_word_and_statistics(data):
  13. """
  14. 分词并统计词频
  15. """
  16. logging.info("开始执行分词操作并进行词频统计")
  17. # 分词结果容器
  18. key_dict = {}
  19. # 停用词
  20. stop_word = load_stop_word()
  21. # 待处理数据总数量
  22. total_num = len(data)
  23. logging.info("共需处理 %d 条数据" % total_num)
  24. for i, item in enumerate(data):
  25. # 只需要第一列的数据
  26. longTailKey = item.split(",")[0]
  27. # 移除换行符
  28. longTailKey = longTailKey.replace("\n", "")
  29. # 分词
  30. cutWord = jieba.cut_for_search(longTailKey)
  31. # 统计
  32. for word in cutWord:
  33. # 过滤停用词
  34. if word in stop_word:
  35. continue
  36. if word in key_dict:
  37. key_dict[word] = key_dict[word] + 1
  38. else:
  39. key_dict[word] = 1
  40. # 进度提示
  41. tools.tip(total_num, i)
  42. # 根据词频倒序排列
  43. logging.info("根据词频进行倒序排列")
  44. sorted_key_list = sorted(key_dict.items(), key=lambda x: x[1], reverse=True)
  45. logging.info("分词操作并进行词频统计 结束")
  46. return sorted_key_list
  47. def main():
  48. # 日志初始化
  49. tools.init_log()
  50. tools.log_start_msg(TITLE)
  51. if not os.path.exists(DATA_FILE):
  52. logging.warning("待处理的数据文件不存在:%s" % DATA_FILE)
  53. return
  54. # 读取数据
  55. logging.info("正在读取待处理的数据文件:%s" % DATA_FILE)
  56. lines = None
  57. with open(DATA_FILE, "r", encoding=config.ENCODING_CHARSET) as f:
  58. lines = f.readlines()
  59. # 执行分词和词频统计(跳过前两行)
  60. word_root_list = cut_word_and_statistics(lines[2:])
  61. # 导出数据
  62. logging.info("正在导出分词数据,位置:%s" % config.CUT_FILE)
  63. with open(config.CUT_FILE, "w", encoding=config.ENCODING_CHARSET) as f:
  64. for key, count in word_root_list:
  65. f.write("%s,%d\n" % (key, count))
  66. tools.log_end_msg(TITLE)
  67. if __name__ == '__main__':
  68. main()