# -*- coding:utf-8 -*-
import os
import time
import zipfile

import jieba

import utils
from agg import prepare_word_split_and_reverse_index
from constant import FILE_LONG_TAIL_MERGE

# File suffix for the extracted long-tail keyword files: <name>_长尾词.txt
FILE_SUFFIX_LONG_TAIL = "_长尾词.txt"
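
# Processing pipeline (inferred from the main block at the bottom of this file):
#   1. extract_word_from_5118 - unpack the 5118 export archives into *_长尾词.txt files
#   2. merge_word             - merge all *_长尾词.txt files into one deduplicated list
#   3. word_split_statistics / prepare_word_split_and_reverse_index - segment and count
# The extraction step assumes the 5118 export layout implied by the parsing code
# below: gbk-encoded CSV files inside each zip, two header lines, and the keyword
# in the first column.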


def extract_word_from_5118(file_path: str):
    """
    Extract keyword data from 5118 keyword zip archives.
    :param file_path: path of the directory to process
    :return: None
    """
    file_list = []
    for file in os.listdir(file_path):
        # Only process zip archives; the directory may also contain
        # previously generated output files.
        if file.endswith(".zip"):
            file_list.append(os.path.join(file_path, file))
    for file in file_list:
        with zipfile.ZipFile(file) as zfile:
            for filename in zfile.namelist():
                # Re-encode the file name into its correct form: the zip format
                # stores names as cp437, but they are actually gbk-encoded.
                real_name = filename.encode('cp437').decode('gbk')
                # Skip files that carry no keyword data
                if real_name in ['打开乱码如何处理?.txt']:
                    continue
                # Container for the keywords (a set, so duplicates are dropped)
                word_container = set()
                # Read the file inside the archive
                with zfile.open(filename) as file_content:
                    lines = file_content.readlines()
                    # Skip the two header lines
                    for line in lines[2:]:
                        split = line.decode("gbk").split(",")
                        # Only the first column (the keyword) is needed
                        word_container.add(split[0])
                output_file_name = real_name[0:real_name.index("--")]
                output_file_path = os.path.join(file_path, output_file_name + FILE_SUFFIX_LONG_TAIL)
                with open(output_file_path, "w", encoding="utf-8") as f:
                    for item in word_container:
                        f.write(item)
                        f.write("\n")


def merge_word(file_path: str):
    """
    Merge the long-tail keyword files (with deduplication).
    :param file_path: path of the directory to process
    :return: None
    """
    # Collect the long-tail keyword files
    file_list = []
    for file in os.listdir(file_path):
        if file.endswith(FILE_SUFFIX_LONG_TAIL):
            file_list.append(os.path.join(file_path, file))
    # Set container for the long-tail keywords
    word_set = set()
    # Read the data; the set drops duplicates
    for file in file_list:
        with open(file, "r", encoding="utf-8") as f:
            for word in f:
                word_set.add(word.rstrip("\n"))
    # Save the merged result
    with open(os.path.join(file_path, FILE_LONG_TAIL_MERGE), "w", encoding="utf-8") as f:
        for item in word_set:
            f.write(item)
            f.write("\n")


def word_split_statistics(file_path: str):
    """
    Word-segmentation statistics for each long-tail keyword file.
    :param file_path: path of the directory to process
    :return: None
    """
    file_list = []
    for file in os.listdir(file_path):
        file_list.append(os.path.join(file_path, file))
    stop_word_dict = utils.load_stop_word()
    for file in file_list:
        if not file.endswith(FILE_SUFFIX_LONG_TAIL):
            continue
        # Container for the segmentation counts
        key_dict = {}
        with open(file, "r", encoding="utf-8") as f:
            for tmp_word in f:
                # Segment the keyword (search-engine mode yields finer-grained tokens)
                word_list = jieba.cut_for_search(tmp_word.rstrip("\n"))
                # Count the tokens
                for word in word_list:
                    # Filter out stop words
                    if word in stop_word_dict:
                        continue
                    key_dict[word] = key_dict.get(word, 0) + 1
        # Sort by frequency, descending
        sorted_key_list = sorted(key_dict.items(), key=lambda x: x[1], reverse=True)
        # os.path.basename instead of a hard-coded "\\" keeps this portable
        base_name = os.path.basename(file)
        output_file_name = base_name[0:base_name.index(FILE_SUFFIX_LONG_TAIL)]
        output_file_path = os.path.join(file_path, output_file_name + "_长尾词_分词统计.csv")
        with open(output_file_path, "w", encoding="utf-8") as f:
            for key, count in sorted_key_list:
                f.write("%s,%d\n" % (key, count))
- if __name__ == "__main__":
- print("开始时间" + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
- # filePath = "../data"
- filePath = "../data/test"
- # extract_word_from_5118(filePath)
- # merge_word(filePath)
- prepare_word_split_and_reverse_index(filePath)
- # agg_word(filePath)
- # word_split_statistics(file_path)
- # tasks = utils.avg_split_task(100, 12, 1)
- # 两者计算余弦值等于:0.8
- # val = utils.cal_cos_sim("QQ邮箱格式怎么写", ["QQ", "邮箱", "格式", "怎么", "写"], "QQ邮箱格式如何写",
- # ["QQ", "邮箱", "格式", "如何", "写"])
- print("结束时间" + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))