cut_statistics.py 3.4 KB

# -*- coding:utf-8 -*-
import os

import jieba

# Input data file to process
DATA_KEYWORD_FILE = r"E:\Download\怎么长尾词_1655561719.csv"
# Output result file
CUT_OUTPUT_FILE = "./data/分词与词频统计结果.csv"
# File encoding
ENCODING_CHARSET = "UTF-8"
# Directory containing stop-word files
STOP_WORD_DIR = "./data/stopwords"
# Print a progress message every INTERNAL_NUM rows
INTERNAL_NUM = 50000

def cut_word_and_statistics(data):
    """
    Segment the keywords and count word frequencies.
    """
    print("Starting word segmentation and frequency counting")
    total_num = len(data)
    print("Rows to process: %d" % total_num)
    # Container for the word-frequency result
    key_dict = {}
    for i, item in enumerate(data):
        # Only the first column (the long-tail keyword) is needed
        long_tail_key = item.split(",")[0]
        long_tail_key = long_tail_key.replace("\n", "")
        # Segment the keyword
        cut_word = jieba.cut_for_search(long_tail_key)
        # Count each token
        for word in cut_word:
            if word in key_dict:
                key_dict[word] = key_dict[word] + 1
            else:
                key_dict[word] = 1
        # Progress message
        if i % INTERNAL_NUM == 0:
            print("Segmentation progress: %d / %d" % (i, total_num))
    print("Sorting by frequency in descending order")
    sorted_key_dict = sorted(key_dict.items(), key=lambda x: x[1], reverse=True)
    print("Segmentation finished")
    return sorted_key_dict
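
# A minimal, behavior-equivalent sketch of the counting loop above using
# collections.Counter (illustrative only; the helper below is not called
# anywhere in this script, and its name is made up for the example).
def cut_word_with_counter(data):
    from collections import Counter
    counter = Counter()
    for item in data:
        # Same preprocessing as cut_word_and_statistics: first column, strip newline
        long_tail_key = item.split(",")[0].replace("\n", "")
        counter.update(jieba.cut_for_search(long_tail_key))
    # most_common() returns (word, count) pairs sorted by count, descending
    return counter.most_common()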

def load_stop_words():
    """
    Load the stop-word list from every file under STOP_WORD_DIR.
    """
    print("Loading stop words - start")
    # Stop-word container
    stop_word = []
    stop_word_files = os.listdir(STOP_WORD_DIR)
    for file in stop_word_files:
        stop_word_file = os.path.join(STOP_WORD_DIR, file)
        with open(stop_word_file, encoding=ENCODING_CHARSET) as f:
            for item in f:
                stop_word.append(item.replace("\n", ""))
    print("Stop words before deduplication:", len(stop_word))
    stop_word = list(set(stop_word))
    print("Stop words after deduplication:", len(stop_word))
    print("Loading stop words - end")
    return stop_word

def filter_stop_word(word_root: list):
    """
    Remove stop words from the (word, count) pairs produced by
    cut_word_and_statistics.
    """
    print("Filtering stop words - start")
    # Load stop words; use a set for fast membership checks
    stop_word = set(load_stop_words())
    print("Word count before filtering: %d" % len(word_root))
    # Drop every pair whose word is a stop word
    word_root_filter = dict((key, value) for key, value in word_root if key not in stop_word)
    print("Word count after filtering: %d" % len(word_root_filter))
    print("Filtering stop words - end")
    return word_root_filter

def main():
    print("Start")
    if not os.path.exists(DATA_KEYWORD_FILE):
        raise FileNotFoundError("Input data file does not exist: %s" % DATA_KEYWORD_FILE)
    # Read the data
    print("Reading rows from the input data file")
    with open(DATA_KEYWORD_FILE, "r", encoding=ENCODING_CHARSET) as f:
        lines = f.readlines()
    # Segment and count word frequencies (only the first 100 rows are used here)
    word_root = cut_word_and_statistics(lines[:100])
    # Filter out stop words
    word_root_filter = filter_stop_word(word_root)
    # Export the filtered result, without header or row index
    # print("Exporting the filtered result")
    # word_root_dataframe.to_csv(CUT_OUTPUT_FILE, header=False, index=False)
    with open(CUT_OUTPUT_FILE, "w", encoding=ENCODING_CHARSET) as f:
        for item in word_root_filter.items():
            f.write("%s,%d\n" % item)
    print("End")


if __name__ == '__main__':
    main()
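
# Illustrative usage note (not part of the original script): after main() has
# run, CUT_OUTPUT_FILE holds one "word,count" row per line, sorted by count in
# descending order, so the first row is the most frequent token. A quick
# preview such as the following (hypothetical snippet) would show the top rows:
#
#     with open(CUT_OUTPUT_FILE, encoding=ENCODING_CHARSET) as f:
#         for _, line in zip(range(10), f):
#             print(line.rstrip("\n"))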