# -*- coding:utf-8 -*-
"""Cut long-tail keywords into words with jieba and count word frequencies."""
import logging
import logging.config
import os
from collections import Counter

import jieba

import config
import tools
from stop_word import load_stop_word

TITLE = "分词处理"

# Data file to process.
# Raw string: the Windows path contains backslashes that must not be
# interpreted as escape sequences ("\D", "\怎" are invalid escapes).
DATA_FILE = r"E:\Download\怎么长尾词_1655561719.csv"


def cut_word_and_statistics(data):
    """Segment the first CSV column of each line and count word frequency.

    :param data: list of CSV lines; only the first comma-separated column
                 (the long-tail keyword) of each line is used.
    :return: list of ``(word, count)`` tuples sorted by count, descending.
    """
    logging.info("开始执行分词操作并进行词频统计")
    # Word-frequency accumulator.
    word_counter = Counter()
    # Stop words excluded from the statistics.
    stop_word = load_stop_word()
    # Total number of rows, used for the progress indicator.
    total_num = len(data)
    logging.info("共需处理 %d 条数据" % total_num)
    for i, item in enumerate(data):
        # Only the first column holds the keyword; strip the trailing newline.
        long_tail_key = item.split(",")[0].replace("\n", "")
        # Search-engine style segmentation (finer grained than the default cut).
        for word in jieba.cut_for_search(long_tail_key):
            # Skip stop words.
            if word not in stop_word:
                word_counter[word] += 1
        # Progress indicator.
        tools.tip(total_num, i)
    # Sort by frequency, highest first (stable, so equal counts keep
    # insertion order, matching the previous dict-based implementation).
    logging.info("根据词频进行倒序排列")
    sorted_key_list = sorted(word_counter.items(), key=lambda x: x[1], reverse=True)
    logging.info("分词操作并进行词频统计 结束")
    return sorted_key_list


def main():
    """Entry point: read DATA_FILE, run the word statistics, export the result."""
    # Logging initialization.
    tools.init_log()
    tools.log_start_msg(TITLE)
    if not os.path.exists(DATA_FILE):
        logging.warning("待处理的数据文件不存在:%s" % DATA_FILE)
        return
    # Read the input data.
    logging.info("正在读取待处理的数据文件:%s" % DATA_FILE)
    with open(DATA_FILE, "r", encoding=config.ENCODING_CHARSET) as f:
        lines = f.readlines()
    # Run segmentation and frequency statistics (skip the two header rows).
    word_root_list = cut_word_and_statistics(lines[2:])
    # Export the result as "word,count" lines.
    logging.info("正在导出分词数据,位置:%s" % config.CUT_FILE)
    with open(config.CUT_FILE, "w", encoding=config.ENCODING_CHARSET) as f:
        for key, count in word_root_list:
            f.write("%s,%d\n" % (key, count))
    tools.log_end_msg(TITLE)


if __name__ == '__main__':
    main()