# -*- coding:utf-8 -*-
"""Utility helpers: pickle caching, stop-word loading, task splitting and
cosine similarity between words."""
import math
import os
import pickle
import re

import numpy as np

# Directory containing the stop-word files.
STOP_WORD_DIR = "./conf/stopwords"
# Directory for temporary/cache files.
TEMP_PATH = "../tmp"
# File name of the cached stop-word model.
STOP_WORD_CACHE = "stop_word.pkl"
# Characters that need extra escaping when embedded in a regular expression.
RE_SPECIAL_SYMBOL = [".", "?", "^", "$", "*", "+", "\\", "[", "]", "|", "{", "}", "(", ")"]
  15. def save_obj(path, obj):
  16. """
  17. 保存对象至本地
  18. """
  19. with open(path, "wb") as f:
  20. pickle.dump(obj, f)
  21. def load_obj(path):
  22. """
  23. 加载对象
  24. """
  25. with open(path, "rb") as f:
  26. return pickle.load(f)
  27. def load_stop_word():
  28. """
  29. 加载停用词
  30. """
  31. # 判断临时文件路径是否存在,不存在则重新创建
  32. if not os.path.exists(TEMP_PATH):
  33. os.makedirs(TEMP_PATH)
  34. # 判断是否存在缓存
  35. stop_word_cache_path = os.path.join(TEMP_PATH, STOP_WORD_CACHE)
  36. if os.path.exists(stop_word_cache_path) and os.path.isfile(stop_word_cache_path):
  37. return load_obj(stop_word_cache_path)
  38. # 停用词容器
  39. stop_word = set()
  40. # 构建停用词列表
  41. stop_word_files = os.listdir(STOP_WORD_DIR)
  42. for file in stop_word_files:
  43. stop_word_file = os.path.join(STOP_WORD_DIR, file)
  44. with open(stop_word_file, encoding="UTF-8") as f:
  45. for item in f:
  46. # 移除换行符
  47. stop_word.add(item.replace("\n", "").replace("\r", ""))
  48. # 改成dict提升检索速度
  49. stop_word_dict = {}
  50. for item in stop_word:
  51. stop_word_dict[item] = None
  52. # 保存本地作为缓存
  53. save_obj(stop_word_cache_path, stop_word_dict)
  54. return stop_word_dict
  55. def avg_split_task(total: int, split_internal: int, start=0):
  56. """
  57. 平分任务,包含开始位置,不包含结束位置,开始位置是从0开始
  58. :param start: 开始位置
  59. :param total: 任务总数量
  60. :param split_internal: 每份数量
  61. :return: (开始位置,结束位置)
  62. """
  63. # 分割的任务份数
  64. split_num = math.ceil(total / split_internal)
  65. # 平分
  66. tasks = []
  67. for i in range(split_num):
  68. # 计算平分点在列表中的位置
  69. start_pos = i * split_internal
  70. end_pos = i * split_internal + split_internal
  71. if i == 0:
  72. start_pos = start
  73. # 如果超过列表大小需要额外处理
  74. if end_pos >= total:
  75. end_pos = -1
  76. tasks.append([start_pos, end_pos])
  77. return tasks
  78. def cal_cos_sim(a_word: str, a_stem: list, b_word: str, b_stem: list):
  79. """
  80. 计算余弦相似性
  81. :param a_word: A词
  82. :param a_stem: A词根列表
  83. :param b_word: B词
  84. :param b_stem: B词根列表
  85. :return: 余弦值
  86. """
  87. # 合并词根
  88. union_stem = list(set(a_stem).union(set(b_stem)))
  89. # 生成词向量
  90. a_vec, b_vec = [], []
  91. for word in union_stem:
  92. if word in RE_SPECIAL_SYMBOL:
  93. word = "\\" + word
  94. if word == "c++":
  95. word = "c\\+\\+"
  96. a_vec.append(len(re.findall(word, a_word)))
  97. b_vec.append(len(re.findall(word, b_word)))
  98. # 计算余弦相关性
  99. vec1 = np.array(a_vec)
  100. vec2 = np.array(b_vec)
  101. val = (np.linalg.norm(vec1) * np.linalg.norm(vec2))
  102. if val == 0:
  103. return 0
  104. return vec1.dot(vec2) / val
  105. def remove_line_break(line: str):
  106. """
  107. 移除换行符
  108. :param line: 待处理文本
  109. :return: 替换后的结果
  110. """
  111. if line:
  112. return line.replace("\r", "").replace("\n", "")
  113. return line