```python
# -*- coding:utf-8 -*-
import os

import jieba

# Input data file to process (raw string so the backslashes are kept literally)
DATA_KEYWORD_FILE = r"E:\Download\怎么长尾词_1655561719.csv"
# Output result file
CUT_OUTPUT_FILE = "./data/分词与词频统计结果.csv"
# File encoding
ENCODING_CHARSET = "UTF-8"
# Directory containing the stop word lists
STOP_WORD_DIR = "./data/stopwords"
# Report progress every INTERNAL_NUM rows
INTERNAL_NUM = 50000
def cut_word_and_statistics(data):
    """
    Segment the keywords and count word frequencies.
    """
    print("Starting word segmentation and frequency counting")
    # Skip the first two header rows of the CSV
    rows = data[2:]
    total_num = len(rows)
    print("Rows to process: %d" % total_num)
    # Container for the word-frequency counts
    key_dict = {}
    for i, item in enumerate(rows):
        # Only the first column (the long-tail keyword) is needed
        longTailKey = item.split(",")[0]
        longTailKey = longTailKey.replace("\n", "")

        # Segment the keyword
        cutWord = jieba.cut_for_search(longTailKey)
        # Count each segmented word
        for word in cutWord:
            if word in key_dict:
                key_dict[word] = key_dict[word] + 1
            else:
                key_dict[word] = 1

        # Progress report
        if i % INTERNAL_NUM == 0:
            print("Segmentation progress: %d / %d" % (i, total_num))

    print("Sorting by word frequency in descending order")
    sorted_key_dict = sorted(key_dict.items(), key=lambda x: x[1], reverse=True)
    print("Segmentation finished")
    return sorted_key_dict

def load_stop_words():
    """
    Load the stop word lists.
    """
    print("Loading stop words - start")
    # Container for the stop words
    stop_word = []
    stop_word_files = os.listdir(STOP_WORD_DIR)
    for file in stop_word_files:
        stop_word_file = os.path.join(STOP_WORD_DIR, file)
        with open(stop_word_file, encoding=ENCODING_CHARSET) as f:
            for item in f:
                stop_word.append(item.replace("\n", ""))
    print("Stop words before deduplication:", len(stop_word))
    stop_word = list(set(stop_word))
    print("Stop words after deduplication:", len(stop_word))
    print("Loading stop words - end")
    return stop_word

def filter_stop_word(word_root: list):
    """
    Remove stop words from the segmentation result.
    """
    print("Filtering stop words - start")
    # Load the stop words; a set gives O(1) membership tests
    stop_word = set(load_stop_words())

    print("Word count before filtering: %d" % len(word_root))
    # word_root is a list of (word, count) pairs sorted by frequency;
    # building a dict from it keeps that order (Python 3.7+)
    word_root_filter = dict((key, value) for key, value in word_root if key not in stop_word)
    print("Word count after filtering: %d" % len(word_root_filter))
    print("Filtering stop words - end")
    return word_root_filter

def main():
    print("Start")
    if not os.path.exists(DATA_KEYWORD_FILE):
        raise FileNotFoundError("Input data file not found: %s" % DATA_KEYWORD_FILE)
    # Read the raw data
    print("Reading data from the input file")
    with open(DATA_KEYWORD_FILE, "r", encoding=ENCODING_CHARSET) as f:
        lines = f.readlines()

    # Segment the keywords and count word frequencies
    # (only the first 100 rows here; pass `lines` to process the whole file)
    word_root = cut_word_and_statistics(lines[:100])
    # Filter out stop words
    word_root_filter = filter_stop_word(word_root)

    # Export the filtered result, without header or row index
    print("Exporting the filtered result")
    with open(CUT_OUTPUT_FILE, "w", encoding=ENCODING_CHARSET) as f:
        for item in word_root_filter.items():
            f.write("%s,%d\n" % item)
    print("Done")


if __name__ == '__main__':
    main()
```
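
For reference, the per-word counting and descending sort done by hand in `cut_word_and_statistics` can also be expressed with `collections.Counter`. A minimal sketch, assuming the same `jieba.cut_for_search` segmentation; the sample keywords are made up for illustration:

```python
# Sketch: word-frequency counting with collections.Counter instead of a manual dict.
from collections import Counter

import jieba

# Hypothetical sample keywords; in the script above these come from the CSV rows.
keywords = ["长尾词挖掘工具", "长尾词是什么意思"]

counter = Counter()
for keyword in keywords:
    # cut_for_search segments with extra granularity, as in the script above
    counter.update(jieba.cut_for_search(keyword))

# most_common() returns (word, count) pairs already sorted by descending frequency
print(counter.most_common(10))
```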