- # -*- coding:utf-8 -*-
import logging
import logging.config
import os
from collections import Counter

import jieba

import config
import tools
from stop_word import load_stop_word
TITLE = "分词处理"
# Input CSV file to process.  Raw string: the Windows backslashes would
# otherwise be parsed as (invalid) escape sequences such as "\D".
DATA_FILE = r"E:\Download\怎么长尾词_1655561719.csv"
def cut_word_and_statistics(data):
    """Tokenize the first column of each line and count word frequencies.

    Args:
        data: iterable of CSV-style text lines; only the first
            comma-separated column of each line is segmented.

    Returns:
        List of ``(word, count)`` tuples sorted by count, descending
        (ties keep first-seen order, matching a stable reverse sort).
    """
    logging.info("开始执行分词操作并进行词频统计")
    # Words to exclude from the statistics.
    stop_word = load_stop_word()
    total_num = len(data)
    # Lazy logger args: the message is only formatted if emitted.
    logging.info("共需处理 %d 条数据", total_num)
    # word -> frequency accumulator.
    key_dict = Counter()
    for i, item in enumerate(data):
        # Only the first column holds the keyword; strip the newline.
        long_tail_key = item.split(",")[0].replace("\n", "")
        # Search-engine style segmentation (finer-grained than default cut).
        for word in jieba.cut_for_search(long_tail_key):
            # Skip stop words.
            if word not in stop_word:
                key_dict[word] += 1
        # Progress indicator.
        tools.tip(total_num, i)

    # Counter.most_common() == sorted(items, key=count, reverse=True).
    logging.info("根据词频进行倒序排列")
    sorted_key_list = key_dict.most_common()
    logging.info("分词操作并进行词频统计 结束")
    return sorted_key_list
def main():
    """Entry point: read the keyword CSV, segment words, export frequencies."""
    # Initialise logging before anything else.
    tools.init_log()
    tools.log_start_msg(TITLE)
    if not os.path.exists(DATA_FILE):
        logging.warning("待处理的数据文件不存在:%s", DATA_FILE)
        return
    # Read the raw lines of the source file.
    logging.info("正在读取待处理的数据文件:%s", DATA_FILE)
    with open(DATA_FILE, "r", encoding=config.ENCODING_CHARSET) as f:
        lines = f.readlines()

    # Segment and count word frequencies (the first two lines are
    # headers produced by the keyword export tool — skip them).
    word_root_list = cut_word_and_statistics(lines[2:])

    # Export one "word,count" row per entry, already sorted by frequency.
    logging.info("正在导出分词数据,位置:%s", config.CUT_FILE)
    with open(config.CUT_FILE, "w", encoding=config.ENCODING_CHARSET) as f:
        f.writelines("%s,%d\n" % (key, count) for key, count in word_root_list)
    tools.log_end_msg(TITLE)


if __name__ == '__main__':
    main()