# -*- coding:utf-8 -*-
import math
import os
import pickle
import re

import numpy as np

# Directory that holds the stop-word files
STOP_WORD_DIR = "./conf/stopwords"
# Directory used for temporary files
TEMP_PATH = "../tmp"
# File name of the cached stop-word object
STOP_WORD_CACHE = "stop_word.pkl"
# Symbols that carry a special meaning inside a regular expression
RE_SPECIAL_SYMBOL = [".", "?", "^", "$", "*", "+", "\\", "[", "]", "|", "{", "}", "(", ")"]


def save_obj(path, obj):
    """
    Pickle an object to a local file.
    :param path: destination file path
    :param obj: object to serialize
    """
    with open(path, "wb") as f:
        pickle.dump(obj, f)


def load_obj(path):
    """
    Load a pickled object from a local file.
    :param path: path of the pickle file
    :return: the deserialized object
    """
    # NOTE: pickle can execute arbitrary code while loading; only use this on
    # files the application wrote itself, never on untrusted input.
    with open(path, "rb") as f:
        return pickle.load(f)


def load_stop_word():
    """
    Load the stop words, using the pickle cache under TEMP_PATH when present.
    :return: dict whose keys are the stop words (every value is None)
    """
    # Create the temp directory if needed; exist_ok avoids the check-then-create race
    os.makedirs(TEMP_PATH, exist_ok=True)

    # Return the cached result when one exists
    stop_word_cache_path = os.path.join(TEMP_PATH, STOP_WORD_CACHE)
    if os.path.isfile(stop_word_cache_path):
        return load_obj(stop_word_cache_path)

    # Collect the distinct stop words from every file in the stop-word directory
    stop_word = set()
    for file_name in os.listdir(STOP_WORD_DIR):
        stop_word_file = os.path.join(STOP_WORD_DIR, file_name)
        # Skip sub-directories and other non-file entries; open() would fail on them
        if not os.path.isfile(stop_word_file):
            continue
        with open(stop_word_file, encoding="UTF-8") as f:
            for item in f:
                stop_word.add(remove_line_break(item))

    # A dict is kept (rather than the set) so the cached/returned type is unchanged
    stop_word_dict = dict.fromkeys(stop_word)

    # Persist locally as a cache for the next call
    save_obj(stop_word_cache_path, stop_word_dict)
    return stop_word_dict


def avg_split_task(total: int, split_internal: int, start=0) -> list:
    """
    Split ``total`` tasks into consecutive chunks of ``split_internal`` items.

    Positions are zero-based. ``start`` only replaces the start position of the
    first chunk. A chunk whose computed end position reaches or exceeds
    ``total`` gets ``-1`` as its end position.
    NOTE(review): presumably callers treat -1 as "up to the end"; used directly
    as a slice end it would drop the last element - confirm against callers.

    :param total: total number of tasks
    :param split_internal: number of tasks per chunk
    :param start: start position of the first chunk
    :return: list of [start position, end position] pairs
    """
    # Number of chunks needed to cover all tasks
    split_num = math.ceil(total / split_internal)

    tasks = []
    for i in range(split_num):
        start_pos = start if i == 0 else i * split_internal
        end_pos = (i + 1) * split_internal
        # The chunk that reaches the end of the range is marked with -1
        if end_pos >= total:
            end_pos = -1
        tasks.append([start_pos, end_pos])
    return tasks


def cal_cos_sim(a_word: str, a_stem: list, b_word: str, b_stem: list) -> float:
    """
    Compute the cosine similarity of two words over their combined stems.

    Each distinct stem is one vector dimension; the component is the number of
    non-overlapping literal occurrences of that stem in the word.

    :param a_word: word A
    :param a_stem: stems of word A
    :param b_word: word B
    :param b_stem: stems of word B
    :return: cosine value, or 0 when either vector has zero length
    """
    # Merge the stems of both words
    union_stem = set(a_stem).union(b_stem)

    # Build the occurrence-count vectors
    a_vec, b_vec = [], []
    for stem in union_stem:
        # Escape the whole stem so it is matched literally: stems such as
        # ".net", "c++11" or "(x" must not be interpreted as regex syntax.
        pattern = re.compile(re.escape(stem))
        a_vec.append(len(pattern.findall(a_word)))
        b_vec.append(len(pattern.findall(b_word)))

    # Cosine similarity = dot product / product of the norms
    vec1 = np.array(a_vec)
    vec2 = np.array(b_vec)
    val = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if val == 0:
        # Avoid dividing by zero when a vector is empty or all zeros
        return 0
    return vec1.dot(vec2) / val


def remove_line_break(line: str):
    """
    Remove carriage-return and line-feed characters from a string.
    :param line: text to process
    :return: the text without "\\r" and "\\n"; falsy input is returned unchanged
    """
    if line:
        return line.replace("\r", "").replace("\n", "")
    return line