# -*- coding:utf-8 -*-
"""Segment long-tail keywords with jieba and export word-frequency statistics."""
import os
from collections import Counter

import jieba

# 待处理的数据文件 (input CSV; raw string so backslashes are taken literally)
DATA_KEYWORD_FILE = r"E:\Download\怎么长尾词_1655561719.csv"
# 输出的结果文件 (output CSV: one "word,count" per line, no header)
CUT_OUTPUT_FILE = "./data/分词与词频统计结果.csv"
# 文件编码格式 (encoding used for every file read/written here)
ENCODING_CHARSET = "UTF-8"
# 停用词 (directory of stop-word files, one word per line)
STOP_WORD_DIR = "./data/stopwords"
# 间隔进度提示 (emit a progress message every N rows; name kept for
# compatibility — "INTERVAL" was likely the intended spelling)
INTERNAL_NUM = 50000


def cut_word_and_statistics(data):
    """
    分词并统计词频

    Segment the first comma-separated column of each line with jieba
    (search-engine mode) and count word frequencies.

    :param data: iterable of CSV text lines.
    :return: list of (word, count) tuples sorted by count, descending.
    """
    print("开始执行分词操作并进行词频统计")
    total_num = len(data)
    print("共需处理数据:%d" % total_num)
    # Counter tallies in C instead of a hand-rolled dict-increment loop.
    key_counter = Counter()
    # NOTE(review): an earlier comment said "skip the first two rows", but no
    # rows are skipped here — confirm whether the CSV carries header lines.
    for i, item in enumerate(data):
        # 只需要第一列的数据 (only the first column is segmented)
        long_tail_key = item.split(",")[0].replace("\n", "")
        # 分词 (cut_for_search yields finer-grained tokens than cut())
        key_counter.update(jieba.cut_for_search(long_tail_key))
        # 进度提示 (periodic progress message; also fires at i == 0)
        if i % INTERNAL_NUM == 0:
            print("当前分词进度 %d / %d" % (i, total_num))
    print("根据词频进行倒序排列")
    # 根据词频倒序排列 (most frequent word first)
    sorted_key_dict = sorted(key_counter.items(), key=lambda x: x[1], reverse=True)
    print("分词结束")
    return sorted_key_dict


def load_stop_words():
    """
    加载停用词列表

    Load and deduplicate stop words from every file in STOP_WORD_DIR.

    :return: list of unique stop words (order unspecified after dedup).
    """
    print("加载停用词 - 开始")
    # 停用词容器 (accumulates words from all files before dedup)
    stop_word = []
    for file_name in os.listdir(STOP_WORD_DIR):
        stop_word_file = os.path.join(STOP_WORD_DIR, file_name)
        with open(stop_word_file, encoding=ENCODING_CHARSET) as f:
            for item in f:
                stop_word.append(item.replace("\n", ""))
    print("去重前,停用词数量:", len(stop_word))
    stop_word = list(set(stop_word))
    print("去重后,停用词数量:", len(stop_word))
    print("加载停用词 - 结束")
    return stop_word


def filter_stop_word(word_root):
    """
    对分词结果过滤停用词

    Remove stop words from the (word, count) pairs.

    :param word_root: iterable of (word, count) pairs — NOT a dict; the
        caller passes the sorted tuple list from cut_word_and_statistics
        (the previous ``: dict`` annotation was wrong).
    :return: dict mapping word -> count with stop words removed; insertion
        order preserves the caller's sort order on Python 3.7+.
    """
    print("过滤停用词 - 开始")
    # 加载停用词 — a set gives O(1) membership tests instead of scanning a
    # list once per segmented word.
    stop_word = set(load_stop_words())
    print("过滤前,总分词数量:%d" % len(word_root))
    # 过滤停用词 (drop any pair whose word is a stop word)
    word_root_filter = {
        key: value for key, value in word_root if key not in stop_word
    }
    print("过滤后,总分词数量:%d" % len(word_root_filter))
    print("过滤停用词 - 结束")
    return word_root_filter


def main():
    """Read the keyword CSV, segment + count, filter stop words, export CSV."""
    print("开始")
    if not os.path.exists(DATA_KEYWORD_FILE):
        raise Exception("待处理的数据文件不存在:%s" % DATA_KEYWORD_FILE)
    # 读取数据 (read the raw CSV lines)
    print("从待处理的数据文件中读取数据")
    with open(DATA_KEYWORD_FILE, "r", encoding=ENCODING_CHARSET) as f:
        lines = f.readlines()
    # 执行分词和词频统计 — previously only the first 100 rows were processed
    # (``lines[:100]``, a debug leftover); the whole file is segmented now.
    word_root = cut_word_and_statistics(lines)
    # 过滤停用词
    word_root_filter = filter_stop_word(word_root)
    # 导出过滤后的数据,不要表头和行号 (export without header or row index)
    with open(CUT_OUTPUT_FILE, "w", encoding=ENCODING_CHARSET) as f:
        for item in word_root_filter.items():
            f.write("%s,%d\n" % item)
    print("结束")


if __name__ == '__main__':
    main()