# -*- coding:utf-8 -*-
import config
import os
import tools
import jieba
import logging
import logging.config
from collections import Counter

from stop_word import load_stop_word

# Input data file to be processed (raw string: Windows path with backslashes)
INPUT_FILE = r"E:\Download\怎么长尾词_1655561719.csv"
# Output file for the segmentation / frequency results
OUTPUT_FILE = r"E:\Download\长尾关键词\怎么长尾词_分词统计.csv"


def cut_word_and_statistics(data):
    """
    Segment each long-tail keyword and count word frequencies.

    :param data: iterable of CSV lines; only the first comma-separated
                 column of each line is segmented.
    :return: list of (word, count) tuples sorted by count, descending.
    """
    logging.info("开始执行分词操作并进行词频统计")

    # Word -> occurrence count accumulator
    key_dict = Counter()

    # Stop words are filtered out of the statistics
    stop_word = load_stop_word()

    total_num = len(data)
    logging.info("共需处理 %d 条数据", total_num)

    for i, item in enumerate(data):
        # Only the first column holds the long-tail keyword
        long_tail_key = item.split(",")[0].replace("\n", "")

        # Search-engine style segmentation (finer granularity than default cut)
        for word in jieba.cut_for_search(long_tail_key):
            if word in stop_word:
                continue
            key_dict[word] += 1

        # Progress indicator
        tools.tip(total_num, i)

    # Sort by frequency, descending
    logging.info("根据词频进行倒序排列")
    sorted_key_list = key_dict.most_common()

    logging.info("分词操作并进行词频统计 结束")
    return sorted_key_list


def main(orig_file, dest_file):
    """
    Read keywords from orig_file, run segmentation + frequency statistics,
    and write "word,count" lines to dest_file.

    :param orig_file: path of the input CSV (first two header lines skipped)
    :param dest_file: path of the output CSV
    """
    if not os.path.exists(orig_file):
        logging.warning("待处理的数据文件不存在:%s", orig_file)
        return

    logging.info("正在读取待处理的数据文件:%s", orig_file)
    with open(orig_file, "r", encoding=config.ENCODING_CHARSET) as f:
        lines = f.readlines()

    # Skip the first two (header) lines before processing
    word_root_list = cut_word_and_statistics(lines[2:])

    logging.info("正在导出分词数据,位置:%s", dest_file)
    with open(dest_file, "w", encoding=config.ENCODING_CHARSET) as f:
        for key, count in word_root_list:
            f.write("%s,%d\n" % (key, count))


if __name__ == '__main__':
    TITLE = "分词处理"
    # Initialize logging before anything else emits log records
    tools.init_log()
    tools.log_start_msg(TITLE)
    # BUGFIX: main() was previously called with no arguments, which raised
    # TypeError on every run; pass the configured input/output paths.
    main(INPUT_FILE, OUTPUT_FILE)
    tools.log_end_msg(TITLE)