Browse Source

Merge branch 'feature/agg_toolchains' of zkpk/money-mining-python into master

zkpk 1 year ago
parent
commit
6b6cf3cd0d

+ 1 - 0
.gitignore

@@ -165,3 +165,4 @@ cython_debug/
 tmp/
 !src/tmp
 data/
+*.jar

+ 209 - 35
src/agg.py

@@ -1,65 +1,188 @@
 # -*- coding:utf-8 -*-
 import math
 import os
+import subprocess
+import time
+import zipfile
 from concurrent.futures import ProcessPoolExecutor, as_completed
 
 import jieba
 
 import utils
-from constant import FILE_LONG_TAIL_MERGE
 
-# 文件:长尾词_合并_分词.txt
-FILE_LONG_TAIL_MERGE_SPLIT = "长尾词_合并_分词.txt"
+# File suffix: _长尾词.txt
+WORD_FILE_SUFFIX = "_长尾词.txt"
 
-# 文件:长尾词_合并_聚合.txt
-FILE_LONG_TAIL_MERGE_AGG = "长尾词_合并_聚合.txt"
+# 压缩文件后缀:_p.zip
+COMPRESS_FILE_SUFFIX = "_p.zip"
+
+# File: 长尾词.txt (the merged long-tail word list)
+WORD_FILE = "长尾词.txt"
+
+# 文件:长尾词_分词.txt
+WORD_STEM_FILE = "长尾词_分词.txt"
+
+# 文件:长尾词_倒排索引.txt
+WORD_REVERSE_INDEX_FILE = "长尾词_倒排索引.txt"
+
+# 文件:长尾词_聚合结果_临时.txt
+WORD_AGG_RESULT_TEMP_FILE = "长尾词_聚合结果_临时.txt"
+
+# 文件:长尾词_聚合结果.txt
+WORD_AGG_RESULT_FILE = "长尾词_聚合结果.txt"
 
 # 文件夹:历史聚合数据归档文件夹
-DIR_AGG_FILE_ARCHIVE = "长尾词_聚合_归档_%s"
+WORD_AGG_DIR = "长尾词聚合分析_%s"
 
-# 文件:长尾词_合并_分词倒排索引.txt
-FILE_LONG_TAIL_MERGE_REVERSE_INDEX = "长尾词_合并_倒排索引.txt"
+jieba.setLogLevel(jieba.logging.INFO)
+
+
+def agg_word(path: str):
+    """
+    Run the long-tail word aggregation pipeline on ``path``.
+
+    ``path`` may be a directory or a single file. Files ending with
+    ``COMPRESS_FILE_SUFFIX`` are collected as archives and files ending with
+    ``WORD_FILE_SUFFIX`` as word lists. A timestamped result directory is
+    created (inside ``path`` for a directory, beside it for a file) and the
+    pipeline steps run in order, stopping at the first step that returns a
+    falsy value.
+    :param path: target path of the data to analyse
+    :return: None
+    """
+    # Validate the input path
+    if not os.path.exists(path):
+        print("输入的目标路径不存在! " + path)
+        return
 
-# 子文件:长尾词_合并_聚合_%s.txt
-FILE_LONG_TAIL_MERGE_AGG_PID = "长尾词_合并_聚合_%s_%s.txt"
+    # Classify the inputs found at the target path
+    zip_files, txt_files = [], []
+    if os.path.isdir(path):
+        for file in os.listdir(path):
+            file_path = os.path.join(path, file)
+            if file.endswith(COMPRESS_FILE_SUFFIX):
+                zip_files.append(file_path)
+            elif file.endswith(WORD_FILE_SUFFIX):
+                txt_files.append(file_path)
+    elif path.endswith(COMPRESS_FILE_SUFFIX):
+        zip_files.append(path)
+    elif path.endswith(WORD_FILE_SUFFIX):
+        txt_files.append(path)
+
+    if not zip_files and not txt_files:
+        print("目标路径中不存在任何待分析的文件")
+        return
 
-# 缓存前缀:分词词根
-CACHE_WORD_STEM = "word:stem"
+    # Create the result directory. os.path.dirname understands the platform's
+    # separators; searching for "\\" by hand raised ValueError for any file
+    # path that contained no backslash.
+    base_dir = os.path.dirname(path) if os.path.isfile(path) else path
+    data_path = os.path.join(base_dir, WORD_AGG_DIR % time.strftime('%Y%m%d%H%M%S'))
+    os.makedirs(data_path)
+    print("创建聚合分析结果文件夹,路径:" + data_path)
+
+    # Without archives there is nothing to extract, so start at the merge step
+    start_pos = 0 if zip_files else 1
+
+    # Each entry: (label, step function, takes an extra argument?, extra argument)
+    chains = [
+        ("5118关键词压缩文件提取数据", extract_word_from_5118, True, zip_files),
+        ("合并长尾词", merge_word, True, txt_files),
+        ("长尾词分词和建立倒排索引", word_split_and_reverse_index, False),
+        ("调用java聚合处理程序", agg_process, False),
+        ("对聚合后的文件内容进行排序重写", sort_file_content, False)
+    ]
+
+    chains = chains[start_pos:]
+    chains_len = len(chains)
+    for i, chain in enumerate(chains, start=1):
+        print("步骤(%s/%s):%s 开始..." % (i, chains_len, chain[0]))
+        is_success = chain[1](data_path, chain[3]) if chain[2] else chain[1](data_path)
+        if not is_success:
+            # A step reported failure: abort the remaining steps
+            print("执行异常结束执行!")
+            return
+
+    print("长尾词聚合程序执行完成!")
+
+
+def extract_word_from_5118(data_path: str, zip_files: list):
+    """
+    Extract keywords from 5118 keyword archives.
+
+    For every member of every archive (except the known non-data file) the
+    first comma-separated column of each line after the first two is
+    collected, de-duplicated, and written to
+    ``<name before "--"> + WORD_FILE_SUFFIX`` inside ``data_path``.
+    :param data_path: result directory that receives the extracted word files
+    :param zip_files: paths of the archives to read
+    :return: True once all archives have been processed
+    """
+    for file in zip_files:
+        # The context manager closes the archive handle even if a member fails
+        with zipfile.ZipFile(file) as z_file:
+            for filename in z_file.namelist():
+                # Member names without the UTF-8 flag are decoded by zipfile
+                # as cp437; this code assumes they are really GBK and
+                # round-trips them to recover the readable name.
+                try:
+                    real_name = filename.encode('cp437').decode('gbk')
+                except (UnicodeEncodeError, UnicodeDecodeError):
+                    # Not a cp437/GBK name (e.g. already decoded as UTF-8)
+                    real_name = filename
+
+                # Skip the known non-data file
+                if real_name in ['打开乱码如何处理?.txt']:
+                    continue
 
-# 缓存前缀:倒排索引
-CACHE_WORD_REVERSE_INDEX = "word:reverse_index"
+                # Keyword container for this member (a set, so duplicates collapse)
+                word_container = set()
 
-# 缓存:长尾词缓存
-CACHE_WORD = "word"
+                # Read the member straight from the archive
+                with z_file.open(filename) as file_content:
+                    lines = file_content.readlines()
+                    # Skip the first two lines
+                    for line in lines[2:]:
+                        split = line.decode("gbk").split(",")
+                        # Only the first column is needed
+                        word_container.add(split[0])
 
-# 缓存:聚合位图
-CACHE_UNUSED_BITMAP = "unused_bitmap"
+                # Output name is the part before "--"; when the separator is
+                # missing, fall back to the member name without its extension
+                # instead of raising ValueError.
+                output_file_name, separator, _ = real_name.partition("--")
+                if not separator:
+                    output_file_name = os.path.splitext(real_name)[0]
+                output_file_path = os.path.join(data_path, output_file_name + WORD_FILE_SUFFIX)
+                with open(output_file_path, "w", encoding="utf-8") as f:
+                    for item in word_container:
+                        f.write(item)
+                        f.write("\n")
 
-# 字符集:UTF-8
-CHARSET_UTF_8 = "UTF-8"
+    return True
 
 
-def prepare_word_split_and_reverse_index(file_path: str):
+def merge_word(data_path: str, txt_files: list):
     """
-    预处理:长尾词分词、建立倒排索引
-    :param file_path: 待处理文件夹路径
+    Merge long-tail word files into ``WORD_FILE`` (de-duplicated).
+
+    Besides ``txt_files``, every file in ``data_path`` whose name ends with
+    ``WORD_FILE_SUFFIX`` is merged too, so the files that
+    ``extract_word_from_5118`` writes into ``data_path`` are not lost.
+    Always returns True once the merged file has been written.
+    :param data_path: result directory; the merged file is written here
+    :param txt_files: paths of the word files to merge
     :return:
     """
+    # Extracted files land in data_path, not in txt_files, so collect them
+    # here. sorted() only makes the read order deterministic.
+    sources = list(txt_files)
+    for name in sorted(os.listdir(data_path)):
+        if name.endswith(WORD_FILE_SUFFIX):
+            extracted = os.path.join(data_path, name)
+            if extracted not in sources:
+                sources.append(extracted)
+
+    # De-duplicating container for the long-tail words
+    word_set = set()
 
+    # Read every source file; the set drops duplicates
+    for file in sources:
+        with open(file, "r", encoding="utf-8") as f:
+            for line in f:
+                word_set.add(utils.remove_line_break(line))
+
+    # Write the merged result
+    with open(os.path.join(data_path, WORD_FILE), "w", encoding="utf-8") as f:
+        for item in word_set:
+            f.write(item)
+            f.write("\n")
+
+    return True
+
+
+def word_split_and_reverse_index(data_path: str):
+    """
+    预处理:长尾词分词 + 建立倒排索引
+    :param data_path: 数据存放路径
+    :return:
+    """
     # 判断文件是否存在
-    word_input_file = os.path.join(file_path, FILE_LONG_TAIL_MERGE)
-    if os.path.exists(word_input_file) and not os.path.isfile(word_input_file):
-        print("文件不存在! " + word_input_file)
-        return
+    file = os.path.join(data_path, WORD_FILE)
+    if os.path.exists(file) and not os.path.isfile(file):
+        print("文件不存在! " + file)
+        return False
 
     # 总文本数量
     total_line_num = 0
-    with open(word_input_file, "r", encoding="utf-8") as fi:
+    with open(file, "r", encoding="utf-8") as fi:
         total_line_num = sum(1 for line in fi)
 
     if total_line_num == 0:
         print("没有待处理的数据,文本量为0")
-        return
+        return True
 
     # 分割任务数量
     task_list = utils.avg_split_task(total_line_num, math.ceil(total_line_num / os.cpu_count()))
@@ -70,7 +193,7 @@ def prepare_word_split_and_reverse_index(file_path: str):
         p_result_list = []
 
         # 提交任务
-        process_futures = [process_pool.submit(word_split_reverse, word_input_file, task[0], task[1]) for task in
+        process_futures = [process_pool.submit(word_split_reverse_process, file, task[0], task[1]) for task in
                            task_list]
 
         # 处理返回结果
@@ -82,8 +205,8 @@ def prepare_word_split_and_reverse_index(file_path: str):
         # 分词结果排序
         p_result_list = sorted(p_result_list, key=lambda v: v[0])
         # 输出分词结果
-        split_output_file = os.path.join(file_path, FILE_LONG_TAIL_MERGE_SPLIT)
-        with open(split_output_file, "w", encoding="UTF-8") as fo:
+        word_stem_file = os.path.join(data_path, WORD_STEM_FILE)
+        with open(word_stem_file, "w", encoding="UTF-8") as fo:
             for start_pos, word_list, reverse_index in p_result_list:
                 for word in word_list:
                     fo.write("%s\n" % word)
@@ -99,17 +222,19 @@ def prepare_word_split_and_reverse_index(file_path: str):
                 else:
                     word_reverse_index_dict[key] = value
         # 输出倒排索引
-        with open(os.path.join(file_path, FILE_LONG_TAIL_MERGE_REVERSE_INDEX), "w", encoding="UTF-8") as fo:
+        with open(os.path.join(data_path, WORD_REVERSE_INDEX_FILE), "w", encoding="UTF-8") as fo:
             for key, values in word_reverse_index_dict.items():
                 fo.write("%s,%s\n" % (key, ",".join(values)))
 
         # 关闭进程池
         process_pool.shutdown()
 
+    return True
+
 
-def word_split_reverse(input_file: str, start_pos: int, end_pos: int):
+def word_split_reverse_process(input_file: str, start_pos: int, end_pos: int):
     """
-    分词和建立倒排索引
+    长尾词分词和建立倒排索引
     :param input_file: 待处理的文件
     :param start_pos: 处理的开始位置
     :param end_pos: 处理的结束位置
@@ -165,3 +290,52 @@ def word_split_reverse(input_file: str, start_pos: int, end_pos: int):
                 word_list.append(",".join(stem_filter_arr))
 
     return start_pos, word_list, reverse_index
+
+
+def agg_process(data_path: str):
+    """
+    Invoke the Java aggregation program on ``data_path``.
+
+    Runs ``java -jar <jar> agg <data_path>`` and waits for it to finish.
+    The jar path is relative, so it is resolved against the current working
+    directory of this process.
+    :param data_path: result directory handed to the Java program
+    :return: True if the process exited with code 0; False if it exited with
+             another code or could not be started at all
+    """
+    cmds = ["java", "-jar", "./resources/money-mining-1.0-jar-with-dependencies.jar", "agg", data_path]
+    try:
+        return_code = subprocess.run(cmds).returncode
+    except OSError as e:
+        # Raised when the executable cannot be launched (e.g. java is not on
+        # PATH); report failure instead of crashing the whole pipeline.
+        print("调用java聚合处理程序失败: %s" % e)
+        return False
+    return 0 == return_code
+
+
+def sort_file_content(data_path: str):
+    """
+    Rewrite the aggregation result with the largest groups first.
+
+    Reads ``WORD_AGG_RESULT_TEMP_FILE`` from ``data_path``, where groups of
+    lines are separated by blank lines, and writes the groups to
+    ``WORD_AGG_RESULT_FILE`` ordered by line count, descending. Each group is
+    followed by one blank line.
+    :param data_path: result directory holding the temporary result file
+    :return: True on success, False if the temporary result file is missing
+    """
+    # Build the source path; isfile() is False for missing paths as well
+    src_path = os.path.join(data_path, WORD_AGG_RESULT_TEMP_FILE)
+    if not os.path.isfile(src_path):
+        print("文件不存在! " + src_path)
+        return False
+
+    groups, current = [], []
+    dest_path = os.path.join(data_path, WORD_AGG_RESULT_FILE)
+    with (open(src_path, "r", encoding="UTF-8") as fr,
+          open(dest_path, "w", encoding="UTF-8") as fw):
+        for line in fr:
+            if line.startswith("\n"):
+                # A blank line closes the current group; repeated blank
+                # lines are ignored.
+                if current:
+                    groups.append(current)
+                    current = []
+            else:
+                # Only the very last line of the file can lack a newline
+                if not line.endswith("\n"):
+                    line += "\n"
+                current.append(line)
+
+        # Keep the last group when the file does not end with a blank line
+        if current:
+            groups.append(current)
+
+        # Stable sort: groups of equal size keep their original order
+        groups.sort(key=len, reverse=True)
+
+        for group in groups:
+            fw.writelines(group)
+            fw.write("\n")
+
+    return True

+ 0 - 10
src/constant.py

@@ -1,10 +0,0 @@
-# -*- coding:utf-8 -*-
-
-# 文件后缀:长尾词_合并.txt
-FILE_LONG_TAIL_MERGE = "长尾词_合并.txt"
-
-
-
-
-
-

+ 0 - 28
src/logging.conf

@@ -1,28 +0,0 @@
-[loggers]
-keys=root
-
-[handlers]
-keys=fileHandler,consoleHandler
-
-[formatters]
-keys=simpleFormatter
-
-[logger_root]
-level=DEBUG
-handlers=fileHandler,consoleHandler
-
-[handler_consoleHandler]
-class=StreamHandler
-args=(sys.stdout,)
-level=DEBUG
-formatter=simpleFormatter
-
-[handler_fileHandler]
-class=FileHandler
-args=('../all.log', 'a', "UTF-8")
-level=DEBUG
-formatter=simpleFormatter
-
-[formatter_simpleFormatter]
-format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
-datefmt=

+ 28 - 0
src/mining.py

@@ -0,0 +1,28 @@
+# -*- coding:utf-8 -*-
+import sys
+
+from agg import agg_word, agg_process
+
+
+def main(args: list):
+    """
+    Program entry point: dispatch on the sub-command in ``args``.
+
+    ``args`` has the layout of ``sys.argv``: ``args[0]`` is the script name,
+    ``args[1]`` the sub-command and ``args[2]`` its argument.
+    :param args: command-line arguments
+    :return: None
+    """
+    # "< 2" also covers an empty list, which "== 1" let through to args[1]
+    if len(args) < 2:
+        print("请输入待运行的程序名")
+        return
+
+    func = args[1]
+    if "agg" == func:
+        # The aggregation command takes exactly one argument: the target path
+        if len(args) == 3:
+            agg_word(args[2])
+        else:
+            print("运行长尾词聚合程序,请先输入目标路径")
+    else:
+        print("输入了不知名程序名:%s" % func)
+
+if __name__ == "__main__":
+    main(sys.argv)

+ 0 - 182
src/money.py

@@ -1,182 +0,0 @@
-# -*- coding:utf-8 -*-
-import os
-import time
-import zipfile
-
-import jieba
-
-import utils
-from agg import prepare_word_split_and_reverse_index
-from constant import FILE_LONG_TAIL_MERGE
-
-# 文件后缀:长尾词.txt
-FILE_SUFFIX_LONG_TAIL = "_长尾词.txt"
-
-
-def extract_word_from_5118(file_path: str):
-    """
-    从5118关键词压缩文件中提取数据
-    :param file_path: 待处理文件夹路径
-    :return: None
-    """
-    file_list = []
-    for file in os.listdir(file_path):
-        file_list.append(os.path.join(file_path, file))
-
-    for i, file in enumerate(file_list):
-        zfile = zipfile.ZipFile(file)
-        filenames = zfile.namelist()
-        for filename in filenames:
-            # 重新编码文件名为正确形式
-            real_name = filename.encode('cp437').decode('gbk')
-
-            # 排除无效文件
-            if real_name in ['打开乱码如何处理?.txt']:
-                continue
-
-            # 关键词存放容器
-            word_container = set()
-
-            # 读取压缩文件中的文件
-            with zfile.open(filename) as file_content:
-                lines = file_content.readlines()
-                # 跳过开头两行
-                for line in lines[2:]:
-                    split = line.decode("gbk").split(",")
-                    # 只需要第一列的数据
-                    word_container.add(split[0])
-
-            output_file_name = real_name[0:real_name.index("--")]
-            output_file_path = os.path.join(file_path, output_file_name + FILE_SUFFIX_LONG_TAIL)
-            with open(output_file_path, "w", encoding="utf-8") as f:
-                for item in word_container:
-                    f.write(item)
-                    f.write("\n")
-
-
-def merge_word(file_path: str):
-    """
-    合并长尾词(带去重)
-    :param file_path: 待处理文件夹路径
-    :return: None
-    """
-    # 获取文件列表
-    file_list = []
-    for file in os.listdir(file_path):
-        if file.endswith(FILE_SUFFIX_LONG_TAIL):
-            file_list.append(os.path.join(file_path, file))
-
-    # 长尾词集合容器
-    word_set = set()
-
-    # 读取数据并排重
-    for i, file in enumerate(file_list):
-        with open(file, "r", encoding="utf-8") as f:
-            for word in f:
-                word_set.add(word.replace("\n", ""))
-
-    # 保存合并结果
-    with open(os.path.join(file_path, FILE_LONG_TAIL_MERGE), "w", encoding="utf-8") as f:
-        for item in word_set:
-            f.write(item)
-            f.write("\n")
-
-
-def word_split_statistics(file_path: str):
-    """
-    分词统计
-    :param file_path: 待处理文件夹路径
-    :return: None
-    """
-
-    file_list = []
-    for file in os.listdir(file_path):
-        file_list.append(os.path.join(file_path, file))
-
-    stop_word_dict = utils.load_stop_word()
-
-    for i, file in enumerate(file_list):
-        if not file.endswith(FILE_SUFFIX_LONG_TAIL):
-            continue
-
-        # 分词结果容器
-        key_dict = {}
-
-        with open(file, "r", encoding="utf-8") as f:
-            for tmp_word in f:
-                # 分词
-                word_list = jieba.cut_for_search(tmp_word.replace("\n", ""))
-                # 统计
-                for word in word_list:
-                    # 过滤停用词
-                    if word in stop_word_dict:
-                        continue
-
-                    if word in key_dict:
-                        key_dict[word] = key_dict[word] + 1
-                    else:
-                        key_dict[word] = 1
-
-        # 根据词频进行倒序排列
-        sorted_key_list = sorted(key_dict.items(), key=lambda x: x[1], reverse=True)
-
-        output_file_name = file[file.rindex("\\") + 1:file.index(FILE_SUFFIX_LONG_TAIL)]
-        output_file_path = os.path.join(file_path, output_file_name + "_长尾词_分词统计.csv")
-        with open(output_file_path, "w", encoding="UTF-8") as f:
-            for key, count in sorted_key_list:
-                f.write("%s,%d\n" % (key, count))
-
-
-def sort_file_content(file_path: str):
-    """
-    对聚合后的文件内容进行排序重写
-    :param file_path:
-    :return:
-    """
-
-    target_path = os.path.join(file_path, "长尾词_合并_聚合.txt")
-    if os.path.exists(target_path) and not os.path.isfile(target_path):
-        print("文件不存在! " + target_path)
-        return
-
-    result = []
-    tmp_result = []
-    count = 0;
-    with (open(target_path, "r", encoding="UTF-8") as fr,
-        open(target_path.replace(".txt", "排序.txt"), "w", encoding="UTF-8") as fw):
-        for line in fr.readlines():
-            if line.startswith("\n"):
-                if not tmp_result:
-                    continue
-                else:
-                    result.append((count, tmp_result))
-                    tmp_result = []
-                    count = 0
-            else:
-                count = count + 1
-                tmp_result.append(line)
-
-        result = sorted(result, key=lambda x: x[0], reverse=True)
-
-        for i, tmp_l in result:
-            for l in tmp_l:
-                fw.write(l)
-            fw.write("\n")
-
-
-
-if __name__ == "__main__":
-    print("开始时间" + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
-    # filePath = "../data"
-    filePath = "../data/test"
-    # extract_word_from_5118(filePath)
-    # merge_word(filePath)
-    # prepare_word_split_and_reverse_index(filePath)
-    # agg_word(filePath)
-    # word_split_statistics(file_path)
-    # tasks = utils.avg_split_task(100, 12, 1)
-    # 两者计算余弦值等于:0.8
-    # val = utils.cal_cos_sim("QQ邮箱格式怎么写", ["QQ", "邮箱", "格式", "怎么", "写"], "QQ邮箱格式如何写",
-    #                         ["QQ", "邮箱", "格式", "如何", "写"])
-    sort_file_content(filePath)
-    print("结束时间" + time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))

+ 0 - 0
src/conf/stopwords/baidu_stopwords.txt → src/resources/stopwords/baidu_stopwords.txt


+ 0 - 0
src/conf/stopwords/cn_stopwords.txt → src/resources/stopwords/cn_stopwords.txt


+ 0 - 0
src/conf/stopwords/hit_stopwords.txt → src/resources/stopwords/hit_stopwords.txt


+ 0 - 0
src/conf/stopwords/scu_stopwords.txt → src/resources/stopwords/scu_stopwords.txt


+ 0 - 0
src/conf/stopwords/停用词.txt → src/resources/stopwords/停用词.txt


+ 0 - 0
src/tmp/__init__.py


+ 0 - 279
src/tmp/agg_word.py

@@ -1,279 +0,0 @@
-# -*- coding:utf-8 -*-
-
-from concurrent.futures import ProcessPoolExecutor, as_completed
-from functools import reduce
-from itertools import combinations
-import math
-import mmap
-import os
-from time import time
-from cal import cal_cos_sim
-
-import config
-import tools
-import re
-import logging
-
-# 问题
-# 用线程处理IO高的部分
-# 主线程利用率极低
-# 优化代码,加快速度(目前速度:约1分钟100个关键词)
-
-# 已解决
-# 输出的格式不正确
-# 分析结果内容没有写入结果中
-# 移除祠根数等于1的词,不做分析
-# 减少重复加载 -> 解决:加入仅在子进程时才加载的判断
-
-tools.init_log()    
-
-def intesect(x, y):
-    """
-    计算集合的交集
-    """
-    return x & y
-
-if __name__ != "__main__":
-    # 停用词
-    stop_word_index = tools.load_stop_word()
-
-    # KEY表索引
-    key_index = tools.load_obj(config.KEY_INDEX_CACHE)
-
-    # 倒排表索引
-    reverse_index = tools.load_obj(config.KEY_REVERSE_INDEX_CACHE)
-
-    # 聚合阈值
-    agg_threshold = 0.8
-
-    # 正则提取
-    # 倒排表 索引
-    index_re = r"'(\d+)'"
-    index_pattern = re.compile(index_re, re.I)
-    # 关键词    
-    key_re = r"[^,]*,(.*),\["
-    key_pattern = re.compile(key_re, re.I)
-    # KEY表 词根
-    stem_re = r"'([^,]*)'"
-    stem_pattern = re.compile(stem_re, re.I)
-
-def sub_process(start_pos, end_pos):
-    """
-    子进程
-    """
-    pid = os.getpid()
-
-    logging.info("子进程-%d 开始执行任务,开始位置:%d,结束位置:%d" % (pid,start_pos, end_pos))
-
-    # 聚合结果
-    agg_result = []
-    
-    # 开始时间
-    start_time = time()
-
-    with open(config.KEY_FILE, "r", encoding=config.ENCODING_CHARSET) as f_key, \
-        mmap.mmap(f_key.fileno(), 0, access=mmap.ACCESS_READ) as f_key_mmap, \
-        open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as f_reverse, \
-        mmap.mmap(f_reverse.fileno(), 0, access=mmap.ACCESS_READ) as f_reverse_mmap :
-        
-        # 把关键词索引转换成对应的位置
-        lower_pos = key_index[start_pos]
-        upper_pos = key_index[end_pos]
-
-        # 移动到开始位置
-        f_key_mmap.seek(lower_pos)
-
-        # 读取主关键词信息
-        a_keys = {}
-        while True:
-            # 校验当前位置是否越界
-            cur_pos = f_key_mmap.tell()
-            if cur_pos >= upper_pos:
-                break
-            
-            line = f_key_mmap.readline().decode("UTF-8")
-            # 提取 关键词、词根
-            key_m = key_pattern.match(line)
-            a_key = key_m.group(1)
-            a_stem = []
-            # 过滤停用词
-            tmp_stem = stem_pattern.findall(line)
-            for stem in tmp_stem:
-                if stem in stop_word_index:
-                    continue
-                a_stem.append(stem)
-            # 保存到容器,如果祠根数等于1则没有比较的价值
-            if len(a_stem) > 1:
-                a_keys[a_key]=a_stem
-
-        # 合并词根
-        all_stem = set()
-        for a_stem in a_keys.values():
-            for stem in a_stem:
-                all_stem.add(stem)
-
-        # 获取倒排信息
-        reverse_dict = {}
-        for stem in all_stem:
-            # 读取倒排表
-            f_reverse_mmap.seek(reverse_index[stem])
-            reverse_line = f_reverse_mmap.readline().decode("UTF-8")
-            # 提取 位置信息
-            b_indexs = index_pattern.findall(reverse_line)
-            reverse_dict[stem]=set(b_indexs)
-        
-        # 计算相关性
-        for a_key, a_stem in a_keys.items():
-            # 计算词根组合
-            logging.debug("子进程-%d 主关键词:%s 开始计算词根组合" % (pid, a_key))
-            tmp_stem = []
-            for stem in a_stem:
-                tmp_stem.append(stem)
-            num = math.ceil(len(tmp_stem) * 0.7)
-            stem_combs = list (combinations(tmp_stem, num))
-            logging.debug("子进程-%d 主关键词:%s 计算词根组合结束" % (pid, a_key))
-
-            logging.debug("子进程-%d 主关键词:%s 开始获取词根涉及的关键词信息" % (pid, a_key))
-            # 计算词根涉及的关键词的交集
-            b_indexs = set()
-            for stem_comb in stem_combs:
-                indexs = [reverse_dict[a_stem] for a_stem in stem_comb]
-                for b_index in  reduce(intesect, indexs):
-                    b_indexs.add(b_index)
-            logging.debug("子进程-%d 主关键词:%s 总祠根数:%d" % (pid, a_key, len(b_indexs)))
-            # 获取关键词信息
-            b_keys = []
-            for b_index in b_indexs:
-                # 读取关键词数据
-                f_key_mmap.seek(key_index[int(b_index)])
-                line = f_key_mmap.readline().decode("UTF-8")
-                # 提取 关键词、词根
-                key_m = key_pattern.match(line)
-                b_key = key_m.group(1)
-                b_stem = stem_pattern.findall(line)
-                b_keys.append((b_key, b_stem))
-            logging.debug("子进程-%d 主关键词:%s 获取词根涉及的关键词信息结束,涉及计算关键词数量:%d" % (pid, a_key, len(b_keys)))
-
-            logging.debug("子进程-%d 主关键词:%s 开始计算相关性" % (pid, a_key))
-            # 结果容器
-            correlation_key = []
-            correlation_key.append(a_key)
-            # 计算相关性
-            if b_keys:
-                for b_key, b_stem in b_keys:
-                    try:
-                        val = cal_cos_sim(a_key, a_stem, b_key, b_stem)
-                        if val >= agg_threshold:
-                            correlation_key.append(b_key)
-                    except Exception as e:
-                        logging.error("主关键词:%s 发生异常,涉及的副关键词信息-关键词:%s,分词:%s" % (a_key, b_key, b_stem), e)
-
-                # 有内容则进行保存
-                if len(correlation_key) > 1:
-                    agg_result.append(correlation_key)
-            logging.debug("子进程-%d 主关键词:%s 计算相关性结束,相关的关键词数据量:%d" % (pid, a_key, (len(correlation_key)-1)))
-
-            
-    logging.info("子进程-%d 执行任务结束,耗时:%f" % (pid, (time() - start_time)))
-
-    return {
-        "agg_result": agg_result,
-        "start_pos": start_pos
-    }
-            
-def main_process():
-    """
-    主进程
-    """
-
-    # 进程数
-    process_num = 4
-
-    # KEY 表总长度
-    total_task = 14500028
-
-    # 任务数量
-    per_task_num = 100
-
-    # 划分子任务:任务进度记录、任务列表
-    process_record, tasks = avg_split_task(total_task, per_task_num)
-
-    with ProcessPoolExecutor(max_workers=process_num) as process_pool, \
-        open(config.AGG_FILE, "a", encoding=config.ENCODING_CHARSET) as f:
-
-        logging.info("主进程:提交任务到子进程")
-        process_futures = [process_pool.submit(sub_process, task[0], task[1]) for task in tasks]
-        
-        for p_future in as_completed(process_futures):
-            logging.debug("主进程:子进程返回部分数据")
-            result = p_future.result()
-
-            # 记录处理进度
-            cur_pos = result["start_pos"]
-            process_record[cur_pos//per_task_num]=1
-
-            # 保存分析结果
-            if result:
-                logging.debug("主进程:存在有效数据开始处理")
-                for correlation_key in result["agg_result"]:
-                    f.write("\n######开始######\n")
-                    for key in correlation_key:
-                        f.write("%s\n" % key)
-            
-            # 保存处理进度
-            tools.save_obj(config.ANALYSE_PROCESS_CACHE, process_record)
-
-            tools.tip(total_task, cur_pos)
-            
-
-                
-
-def avg_split_task(total:int, split_internal:int):
-    """
-    平分任务
-    """
-    # 任务列表
-    tasks = None
-    # 任务进度记录
-    process_record = None
-
-    # 分割的任务份数
-    split_num = math.ceil(total / split_internal)
-
-    # 平分
-    tmp_lists = []
-    for i in range(split_num):
-        # 计算平分点在列表中的位置
-        start_pos = i * split_internal
-        end_pos = i * split_internal + split_internal
-        # 如果超过列表大小需要额外处理
-        if end_pos >= total:
-            end_pos = None
-        tmp_lists.append([start_pos,end_pos])
-    
-    # 加载进度缓存
-    if os.path.exists(config.ANALYSE_PROCESS_CACHE):
-        logging.debug("存在分析进度缓存")
-        process_record = tools.load_obj(config.ANALYSE_PROCESS_CACHE)
-    
-    # 更新任务列表
-    if process_record:
-        tasks = []
-        for task in tmp_lists:
-            pos = task[0] // split_internal
-            if not process_record[pos]:
-                tasks.append(task)
-    else:
-        tasks = tmp_lists
-        process_record = [0 for i in range(len(tmp_lists))]
-
-    return process_record, tasks
-
-if __name__ == "__main__":
-
-    TITLE = "(多进程版 fast_14.py)聚合文件"
-    tools.log_start_msg(TITLE)
-
-    main_process()
-
-    tools.log_end_msg(TITLE)

+ 0 - 139
src/tmp/analyse.py

@@ -1,139 +0,0 @@
-# -*- coding:utf-8 -*-
-
-import re
-import mmap
-import tools
-import jieba
-
-def transfer_str(num):
-    msg = None
-    if num >= 10000:
-        msg = "%d万%d" % (num//10000, num%10000)
-    else:
-        msg = str(num)
-    return msg
-
-def cal(list):
-    list_len = len(list)
-    list_count = sum(list)
-    sum_msg = transfer_str(list_len)
-    count_msg = transfer_str(list_count)
-    avg_msg = transfer_str(int(list_count/list_len))
-    return sum_msg, count_msg, avg_msg
-
-def tip(condition, list):
-    print("条件:%s - 涉及:%s个词根,涉及词数:%s,平均约:%s 词数/词根" % ((condition,)+ cal(list)))
-
-def keyStat(fmap: mmap.mmap, keyword:str):
-    fmap.seek(0)
-    pattern = re.compile(keyword)
-    stopWord = tools.load_stop_word()
-    totalSize = fmap.size()
-    
-    statDict = {}
-    while True:
-        curPos = fmap.tell();
-        if curPos >= totalSize:
-            break
-
-        lineContent = f_mmap.readline().decode("UTF-8")
-        tmpList = pattern.findall(lineContent)
-        if tmpList:
-            cutList = list(jieba.cut_for_search(lineContent.replace("\r","").replace("\n","")))
-            for cutKeyword in cutList:
-                if cutKeyword in stopWord:
-                    continue
-
-                count = statDict.get(cutKeyword)
-                if count:
-                    statDict[cutKeyword]=count+1
-                else:
-                    statDict[cutKeyword]=1
-    
-    sorted_key_list = sorted(statDict.items(), key=lambda x: x[1], reverse=True)
-
-    print("与关键词:%s 相关的词共计:%d" % (keyword, len(sorted_key_list)))
-
-    count_list = [ele for ele in statDict.values()]
-
-    tip("等于1", [val for val in count_list if val == 1])
-
-    tip("大于1小于100", [val for val in count_list if val > 1 and val < 100])
-
-    tip("大于等于100小于200", [val for val in count_list if val >= 100 and val < 200])
-
-    tip("大于等于200小于300", [val for val in count_list if val >= 200 and val < 300])
-
-    tip("大于等于300小于400", [val for val in count_list if val >= 300 and val < 400])
-
-    tip("大于等于400小于500", [val for val in count_list if val >= 400 and val < 500])
-        
-    tip("大于等于500小于1000", [val for val in count_list if val >= 500 and val < 1000])
-
-    tip("大于等于1000小于5000", [val for val in count_list if val >= 1000 and val < 5000])
-
-    tip("大于等于5000小于1万", [val for val in count_list if val >= 5000 and val < 10000])
-
-    tip("大于等于1万小于5万", [val for val in count_list if val >= 10000 and val < 50000])
-
-    tip("大于等于5万小于10万", [val for val in count_list if val >= 50000 and val < 100000])
-
-    tip("大于等于10万", [val for val in count_list if val >= 100000])
-
-    with open("./data/test/stat_%s.csv" % keyword, "w", encoding="UTF-8") as fw:
-        for key, count in sorted_key_list:
-            if count > 1:
-                fw.write("%s,%d\n" % (key, count))
-
-def keyFilter(fmap: mmap.mmap, keyword:str):
-    fmap.seek(0)
-    pattern = re.compile(keyword)
-    
-    totalSize = fmap.size()
-    
-    with open("./data/test/filter_%s.csv" % keyword, "w", encoding="UTF-8") as fw:
-        while True:
-            curPos = fmap.tell();
-            if curPos >= totalSize:
-                break
-
-            lineContent = f_mmap.readline().decode("UTF-8")
-            tmpList = pattern.findall(lineContent)
-            if tmpList:
-                fw.write("%s\n"%lineContent.replace("\r","").replace("\n",""))
-            
-
-def countKeyword(fmap: mmap.mmap, keywords:set):
-    for keyword in keywords:
-        f_mmap.seek(0)
-        pattern = re.compile(keyword)
-    
-        count=0
-    
-        while True:
-            lineContent = f_mmap.readline().decode("UTF-8")
-            if not lineContent:
-                break
-            
-            tmpList = pattern.findall(lineContent)
-            if tmpList:
-                count += 1
-        
-        print("关键词:%s,共出现次数:%d" % (keyword, count))
-
-
-INPUT_FILE = "./data/tmp/merge.csv"
-
-with open(INPUT_FILE, "r", encoding="UTF-8") as f, \
-    mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as f_mmap:
-
-    filterSet = set();
-    with open("./data/过滤名单.txt", "r", encoding="UTF-8") as f_filter:
-        while True:
-            lineContent = f_filter.readline().replace("\n","").replace("\r","")
-            if not lineContent:
-                break
-            
-            filterSet.add(lineContent)
-    
-    countKeyword(f_mmap, filterSet)

+ 0 - 43
src/tmp/cal.py

@@ -1,43 +0,0 @@
-# -*- coding:utf-8 -*-
-
-import config
-import re
-import numpy as np
-
-def merge_stem(a_stem:list, b_stem:list):
-    """
-    合并词根
-    """
-    return list(set(a_stem).union(set(b_stem)))
-
-def gen_word_vec(a_word:str, b_word:str, stem:list):
-    """
-    生成词向量
-    """
-    a_vec, b_vec = [], []
-    for word in stem:
-        # if re.findall(word, config.RE_SPECIAL_SIMBOL):
-        if word in config.RE_SPECIAL_SIMBOL:
-            word = "\\" + word
-        if word == "c++":
-            word = "c\\+\\+"
-        a_vec.append(len(re.findall(word, a_word)))
-        b_vec.append(len(re.findall(word, b_word)))
-    return a_vec, b_vec
-
-def col_sim(vec1, vec2):
-    """
-    计算余弦相似性
-    """
-    return vec1.dot(vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
-
-
-def cal_cos_sim(a_word:str, a_stem:list, b_word:str, b_stem:list):
-    """
-    计算余弦相似性
-    """
-    union_stem = merge_stem(a_stem, b_stem)
-    a_vec, b_vec = gen_word_vec(a_word, b_word, union_stem)
-    val = col_sim(np.array(a_vec), np.array(b_vec))
-    return val
-

+ 0 - 61
src/tmp/config.py

@@ -1,61 +0,0 @@
-# -*- coding:utf-8 -*-
-
-# 文件编码格式
-ENCODING_CHARSET = "UTF-8"
-
-# 分词与词频统计
-CUT_FILE = "./data/tmp/cut.csv"
-
-# 拓展词合并文件
-MERGE_FILE = "./data/tmp/merge.csv"
-
-# 排除合并的文件
-MERGE_EXCLUDE_FILES = ['打开乱码如何处理?.txt']
-
-# 关键词文件(包含三要素:序号、关键词、词根)
-KEY_FILE = "./data/tmp/key.csv"
-
-# 关键词索引文件(包含两个要素:关键词序号、在文件中的位置)(暂时弃用)
-KEY_INDEX_FILE = "./data/tmp/key_index.csv"
-
-# 关键词索引模型 缓存 (包含两个要素:关键词序号、在文件中的位置)
-KEY_INDEX_CACHE = "./data/cache/key_index.pkl"
-
-# 关键词倒排文件(包含两个要素:词根、关键词序号)
-KEY_REVERSE_FILE = "./data/tmp/key_reverse.csv"
-
-# 关键词倒排索引模型 缓存 (包含两个要素:词根、位置)
-KEY_REVERSE_INDEX_CACHE = "./data/cache/key_reverse_index.pkl"
-
-# 关键词倒排文件 数据统计 (包含两个要素:词根,涉及的关键词数量)
-KEY_REVERSE_STATISTICS_FILE = "./data/tmp/key_reverse_statistics.csv"
-
-# 关键词倒排索引热点 缓存 (包含两个要素:词根、位置)
-KEY_REVERSE_INDEX_HOT_CACHE = "./data/cache/key_reverse_index_hot.pkl"
-
-# 最终的聚合分析结果存放文件
-AGG_ANALYSE_FILE = "./data/analyse/%s.csv"
-
-# 聚合结果
-AGG_FILE = "./data/agg_analyse.csv"
-
-# 停用词存放文件夹
-STOP_WORD_DIR = "../conf/stopwords"
-
-# 停用词模型 缓存
-STOP_WORD_CACHE = "./data/cache/stop_word.pkl"
-
-# 分析BITMAP模型 缓存
-ANALYSE_BITMAP_CACHE = "./data/cache/analyse_bitmap.pkl"
-
-# 分析进度模型 缓存
-ANALYSE_PROCESS_CACHE = "./data/cache/analyse_process.pkl"
-
-# 正则表达式中需要额外处理的特殊符号
-RE_SPECIAL_SIMBOL = [".", "?", "^", "$", "*", "+", "\\", "[", "]", "|", "{", "}", "(", ")"]
-
-# 百分比进度提示
-PRECENT_TIPS = 0.01
-
-# 正则提取关键词表中的信息
-KEY_RE_PATTERAN = r"(\d+),([^,]*),(.*)"

+ 0 - 101
src/tmp/cut.py

@@ -1,101 +0,0 @@
-# -*- coding:utf-8 -*-
-
-import config
-import os
-import tools
-import jieba
-import logging
-import logging.config
-
-
-# 待处理的数据文件
-INPUT_FILE = "E:\Download\怎么长尾词_1655561719.csv"
-
-# 处理输出文件
-OUTPUT_FILE = "E:\Download\长尾关键词\怎么长尾词_分词统计.csv"
-
-def cut_word_and_statistics(data):
-
-    """
-    分词并统计词频
-    """
-
-    logging.info("开始执行分词操作并进行词频统计")
-
-    # 分词结果容器
-    key_dict = {}
-    # 停用词
-    stop_word = tools.load_stop_word()
-    # 待处理数据总数量
-    total_num = len(data)
-
-    logging.info("共需处理 %d 条数据" % total_num)
-
-    for i, item in enumerate(data):
-        # 只需要第一列的数据
-        longTailKey = item.split(",")[0]
-        # 移除换行符
-        longTailKey = longTailKey.replace("\n", "")
-        # 分词
-        cutWord = jieba.cut_for_search(longTailKey)
-
-        # 统计
-        for word in cutWord:
-
-            # 过滤停用词
-            if word in stop_word:
-                continue
-
-            if word in key_dict:
-                key_dict[word] = key_dict[word] + 1
-            else:
-                key_dict[word] = 1
-        
-        # 进度提示
-        tools.tip(total_num, i)
-        
-
-    # 根据词频倒序排列
-    logging.info("根据词频进行倒序排列")
-    sorted_key_list = sorted(key_dict.items(), key=lambda x: x[1], reverse=True)
-
-    logging.info("分词操作并进行词频统计 结束")
-
-    return sorted_key_list
-
-def main(orig_file, dest_file):
-
-    if not os.path.exists(orig_file):
-        logging.warning("待处理的数据文件不存在:%s" % orig_file)
-        return
-
-    # 读取数据
-    logging.info("正在读取待处理的数据文件:%s" % orig_file)
-    lines = None
-    with open(orig_file, "r", encoding=config.ENCODING_CHARSET) as f:
-        lines = f.readlines()
-    
-    # 执行分词和词频统计(跳过前两行)
-    word_root_list = cut_word_and_statistics(lines[2:])
-
-    # 导出数据
-    logging.info("正在导出分词数据,位置:%s" % dest_file)
-    with open(dest_file, "w", encoding=config.ENCODING_CHARSET) as f:
-        for key, count in word_root_list:
-            f.write("%s,%d\n" % (key, count))
-
-    
-
-
-if __name__ == '__main__':
-    TITLE = "分词处理"
-
-    # 日志初始化
-    tools.init_log()
-
-    tools.log_start_msg(TITLE)
-
-    main()
-
-    tools.log_end_msg(TITLE)
-

+ 0 - 135
src/tmp/filter.py

@@ -1,135 +0,0 @@
-# -*-: coding:utf-8 -*-
-
-import csv
-import re
-
-def filter3():
-    INPUT_DATA = r"./data/agg_filter3.csv"
-    OUTPUT_TEMP = "./data/agg_filter4.csv"
-
-    startPattern = re.compile("######开始######")
-    keyPattern = re.compile("赚钱")
-
-    total = []
-    sub = None
-    with open(INPUT_DATA, "r", encoding="GBK") as fr,\
-        open(OUTPUT_TEMP, "w", encoding="GBK") as fw:
-
-        for line in fr.readlines():
-            
-            tl = startPattern.findall(line)
-            if len(tl) > 0:
-                sub = []
-                sub.append(line)
-                total.append(sub)
-            elif line.startswith("\n"):
-                continue
-            else:
-                kl = keyPattern.findall(line)
-                if len(kl)>0:
-                    sub.append(line)
-    
-        sortedList = sorted(total, key=lambda x:len(x), reverse=True)
-
-        fw.write("统计信息")
-        fw.write("%s%d\n" % ("总数:", len(sortedList)))
-        fw.write("%s%d\n" %("大于等于1000:", len([subList for subList in sortedList if len(subList)>=1000])))
-        fw.write("%s%d\n" %("大于等于500小于1000:", len([subList for subList in sortedList if len(subList)>=500 and len(subList) < 1000])))
-        fw.write("%s%d\n" %("大于等于100小于500:", len([subList for subList in sortedList if len(subList)>=100 and len(subList) < 500])))
-        fw.write("%s%d\n" %("大于等于50小于100:", len([subList for subList in sortedList if len(subList)>=50 and len(subList)<100])))
-        fw.write("%s%d\n" %("大于等于10小于50:", len([subList for subList in sortedList if len(subList)>=10 and len(subList)<50])))
-        fw.write("%s%d\n" %("大于等于5小于10:", len([subList for subList in sortedList if len(subList)>=5 and len(subList)<10])))
-        fw.write("%s%d\n" %("大于等于3小于5:", len([subList for subList in sortedList if len(subList)>=3 and len(subList)<5])))
-        fw.write("%s%d\n" %("等于2:", len([subList for subList in sortedList if len(subList)==2])))
-        fw.write("%s%d\n" %("等于1:", len([subList for subList in sortedList if len(subList)==1])))
-       
-        for subList in sortedList:
-            if len(subList) == 1:
-                continue
-
-            fw.write("\n")
-            for line in subList:
-                fw.write(line)
-
-def filter2():
-    INPUT_DATA = r"./data/agg_filter.csv"
-    OUTPUT_TEMP = "./data/agg_filter3.csv"
-
-    startPattern = re.compile("######开始######")
-
-    total = []
-    sub = None
-    with open(INPUT_DATA, "r", encoding="GBK") as fr,\
-        open(OUTPUT_TEMP, "w", encoding="GBK") as fw:
-
-        for line in fr.readlines():
-            
-            tl = startPattern.findall(line)
-            if len(tl) > 0:
-                sub = []
-                sub.append(line)
-                total.append(sub)
-            elif line.startswith("\n"):
-                continue
-            else:
-                sub.append(line)
-    
-        sortedList = sorted(total, key=lambda x:len(x), reverse=True)
-
-        fw.write("统计信息")
-        fw.write("%s%d\n" % ("总数:", len(sortedList)))
-        fw.write("%s%d\n" %("大于等于1000:", len([subList for subList in sortedList if len(subList)>=1000])))
-        fw.write("%s%d\n" %("大于等于500小于1000:", len([subList for subList in sortedList if len(subList)>=500 and len(subList) < 1000])))
-        fw.write("%s%d\n" %("大于等于100小于500:", len([subList for subList in sortedList if len(subList)>=100 and len(subList) < 500])))
-        fw.write("%s%d\n" %("大于等于50小于100:", len([subList for subList in sortedList if len(subList)>=50 and len(subList)<100])))
-        fw.write("%s%d\n" %("大于等于10小于50:", len([subList for subList in sortedList if len(subList)>=10 and len(subList)<50])))
-        fw.write("%s%d\n" %("大于等于5小于10:", len([subList for subList in sortedList if len(subList)>=5 and len(subList)<10])))
-        fw.write("%s%d\n" %("大于等于3小于5:", len([subList for subList in sortedList if len(subList)>=3 and len(subList)<5])))
-        fw.write("%s%d\n" %("等于2:", len([subList for subList in sortedList if len(subList)==2])))
-        fw.write("%s%d\n" %("等于1:", len([subList for subList in sortedList if len(subList)==1])))
-
-        for subList in sortedList:
-            if len(subList) == 1:
-                continue
-
-            fw.write("\n")
-            for line in subList:
-                fw.write(line)
-
-        
-
-def filter1():
-    # INPUT_DATA = r"E:\Documents\Code\LongTailKeyDataMining\agg.csv"
-    INPUT_DATA = r"./data/agg_filter.csv"
-    OUTPUT_TEMP = "./data/agg_filter2.csv"
-
-    filterPattern = []
-    with open("./data/过滤名单.txt", "r", encoding="UTF-8") as f_filter:
-        filterSet = set();
-        while True:
-            lineContent = f_filter.readline().replace("\n","").replace("\r","")
-            if not lineContent:
-                break
-                
-            filterSet.add(lineContent)
-
-        for r in filterSet:
-            filterPattern.append(re.compile(r))
-
-    with open(INPUT_DATA, "r", encoding="GBK") as fr,\
-        open(OUTPUT_TEMP, "w", encoding="GBK") as fw:
-
-        for line in fr.readlines():
-            writeFlag = True
-            for p in filterPattern:
-                l = p.findall(line)
-                if len(l) > 0:
-                    writeFlag = False
-                    break
-            
-            if writeFlag:
-                fw.write(line)
-
-if __name__ == '__main__':
-    filter3()
-

+ 0 - 330
src/tmp/filter/DataFilter.py

@@ -1,330 +0,0 @@
-# -*- coding: utf-8 -*-
-
-################################################################################
-## Form generated from reading UI file 'DataFilter.ui'
-##
-## Created by: Qt User Interface Compiler version 6.5.1
-##
-## WARNING! All changes made in this file will be lost when recompiling UI file!
-################################################################################
-
-from PySide6.QtCore import (QCoreApplication, QDate, QDateTime, QLocale,
-    QMetaObject, QObject, QPoint, QRect,
-    QSize, QTime, QUrl, Qt)
-from PySide6.QtGui import (QBrush, QColor, QConicalGradient, QCursor,
-    QFont, QFontDatabase, QGradient, QIcon,
-    QImage, QKeySequence, QLinearGradient, QPainter,
-    QPalette, QPixmap, QRadialGradient, QTransform)
-from PySide6.QtWidgets import (QApplication, QHBoxLayout, QLabel, QLayout,
-    QLineEdit, QPushButton, QSizePolicy, QTextBrowser,
-    QTextEdit, QVBoxLayout, QWidget)
-
-class Ui_Form(object):
-    def setupUi(self, Form):
-        if not Form.objectName():
-            Form.setObjectName(u"Form")
-        Form.setWindowModality(Qt.NonModal)
-        Form.setEnabled(True)
-        Form.resize(1546, 514)
-        Form.setMinimumSize(QSize(800, 400))
-        self.verticalLayout_2 = QVBoxLayout(Form)
-        self.verticalLayout_2.setObjectName(u"verticalLayout_2")
-        self.horizontalLayout_2 = QHBoxLayout()
-        self.horizontalLayout_2.setObjectName(u"horizontalLayout_2")
-        self.filePathBox = QLineEdit(Form)
-        self.filePathBox.setObjectName(u"filePathBox")
-        self.filePathBox.setEnabled(False)
-
-        self.horizontalLayout_2.addWidget(self.filePathBox)
-
-        self.fileBtn = QPushButton(Form)
-        self.fileBtn.setObjectName(u"fileBtn")
-
-        self.horizontalLayout_2.addWidget(self.fileBtn)
-
-
-        self.verticalLayout_2.addLayout(self.horizontalLayout_2)
-
-        self.horizontalLayout_6 = QHBoxLayout()
-        self.horizontalLayout_6.setObjectName(u"horizontalLayout_6")
-        self.horizontalLayout_6.setSizeConstraint(QLayout.SetDefaultConstraint)
-        self.verticalLayout = QVBoxLayout()
-        self.verticalLayout.setObjectName(u"verticalLayout")
-        self.firstKeyBox = QLineEdit(Form)
-        self.firstKeyBox.setObjectName(u"firstKeyBox")
-
-        self.verticalLayout.addWidget(self.firstKeyBox)
-
-        self.horizontalLayout = QHBoxLayout()
-        self.horizontalLayout.setObjectName(u"horizontalLayout")
-        self.firstDigitBtn = QPushButton(Form)
-        self.firstDigitBtn.setObjectName(u"firstDigitBtn")
-
-        self.horizontalLayout.addWidget(self.firstDigitBtn)
-
-        self.firstCategoryBtn = QPushButton(Form)
-        self.firstCategoryBtn.setObjectName(u"firstCategoryBtn")
-
-        self.horizontalLayout.addWidget(self.firstCategoryBtn)
-
-        self.firstCharacterBtn = QPushButton(Form)
-        self.firstCharacterBtn.setObjectName(u"firstCharacterBtn")
-
-        self.horizontalLayout.addWidget(self.firstCharacterBtn)
-
-        self.firstFilterBtn = QPushButton(Form)
-        self.firstFilterBtn.setObjectName(u"firstFilterBtn")
-
-        self.horizontalLayout.addWidget(self.firstFilterBtn)
-
-
-        self.verticalLayout.addLayout(self.horizontalLayout)
-
-        self.label_2 = QLabel(Form)
-        self.label_2.setObjectName(u"label_2")
-
-        self.verticalLayout.addWidget(self.label_2)
-
-        self.firstCategoryBox = QTextEdit(Form)
-        self.firstCategoryBox.setObjectName(u"firstCategoryBox")
-        sizePolicy = QSizePolicy(QSizePolicy.Expanding, QSizePolicy.Expanding)
-        sizePolicy.setHorizontalStretch(0)
-        sizePolicy.setVerticalStretch(0)
-        sizePolicy.setHeightForWidth(self.firstCategoryBox.sizePolicy().hasHeightForWidth())
-        self.firstCategoryBox.setSizePolicy(sizePolicy)
-
-        self.verticalLayout.addWidget(self.firstCategoryBox)
-
-        self.result_label_1 = QLabel(Form)
-        self.result_label_1.setObjectName(u"result_label_1")
-
-        self.verticalLayout.addWidget(self.result_label_1)
-
-        self.firstResultBox = QTextBrowser(Form)
-        self.firstResultBox.setObjectName(u"firstResultBox")
-        sizePolicy.setHeightForWidth(self.firstResultBox.sizePolicy().hasHeightForWidth())
-        self.firstResultBox.setSizePolicy(sizePolicy)
-
-        self.verticalLayout.addWidget(self.firstResultBox)
-
-        self.verticalLayout.setStretch(3, 1)
-        self.verticalLayout.setStretch(5, 5)
-
-        self.horizontalLayout_6.addLayout(self.verticalLayout)
-
-        self.verticalLayout_4 = QVBoxLayout()
-        self.verticalLayout_4.setObjectName(u"verticalLayout_4")
-        self.secondKeyBox = QLineEdit(Form)
-        self.secondKeyBox.setObjectName(u"secondKeyBox")
-
-        self.verticalLayout_4.addWidget(self.secondKeyBox)
-
-        self.horizontalLayout_3 = QHBoxLayout()
-        self.horizontalLayout_3.setObjectName(u"horizontalLayout_3")
-        self.secondDigitBtn = QPushButton(Form)
-        self.secondDigitBtn.setObjectName(u"secondDigitBtn")
-
-        self.horizontalLayout_3.addWidget(self.secondDigitBtn)
-
-        self.secondCategoryBtn = QPushButton(Form)
-        self.secondCategoryBtn.setObjectName(u"secondCategoryBtn")
-
-        self.horizontalLayout_3.addWidget(self.secondCategoryBtn)
-
-        self.secondCharacterBtn = QPushButton(Form)
-        self.secondCharacterBtn.setObjectName(u"secondCharacterBtn")
-
-        self.horizontalLayout_3.addWidget(self.secondCharacterBtn)
-
-        self.secondFilterBtn = QPushButton(Form)
-        self.secondFilterBtn.setObjectName(u"secondFilterBtn")
-
-        self.horizontalLayout_3.addWidget(self.secondFilterBtn)
-
-
-        self.verticalLayout_4.addLayout(self.horizontalLayout_3)
-
-        self.label_4 = QLabel(Form)
-        self.label_4.setObjectName(u"label_4")
-
-        self.verticalLayout_4.addWidget(self.label_4)
-
-        self.secondCategoryBox = QTextEdit(Form)
-        self.secondCategoryBox.setObjectName(u"secondCategoryBox")
-        sizePolicy.setHeightForWidth(self.secondCategoryBox.sizePolicy().hasHeightForWidth())
-        self.secondCategoryBox.setSizePolicy(sizePolicy)
-
-        self.verticalLayout_4.addWidget(self.secondCategoryBox)
-
-        self.result_label_2 = QLabel(Form)
-        self.result_label_2.setObjectName(u"result_label_2")
-
-        self.verticalLayout_4.addWidget(self.result_label_2)
-
-        self.secondResultBox = QTextBrowser(Form)
-        self.secondResultBox.setObjectName(u"secondResultBox")
-
-        self.verticalLayout_4.addWidget(self.secondResultBox)
-
-        self.verticalLayout_4.setStretch(3, 1)
-        self.verticalLayout_4.setStretch(5, 5)
-
-        self.horizontalLayout_6.addLayout(self.verticalLayout_4)
-
-        self.verticalLayout_7 = QVBoxLayout()
-        self.verticalLayout_7.setObjectName(u"verticalLayout_7")
-        self.threeKeyBox = QLineEdit(Form)
-        self.threeKeyBox.setObjectName(u"threeKeyBox")
-
-        self.verticalLayout_7.addWidget(self.threeKeyBox)
-
-        self.horizontalLayout_4 = QHBoxLayout()
-        self.horizontalLayout_4.setObjectName(u"horizontalLayout_4")
-        self.threeDigitBtn = QPushButton(Form)
-        self.threeDigitBtn.setObjectName(u"threeDigitBtn")
-
-        self.horizontalLayout_4.addWidget(self.threeDigitBtn)
-
-        self.threeCategoryBtn = QPushButton(Form)
-        self.threeCategoryBtn.setObjectName(u"threeCategoryBtn")
-
-        self.horizontalLayout_4.addWidget(self.threeCategoryBtn)
-
-        self.threeCharacterBtn = QPushButton(Form)
-        self.threeCharacterBtn.setObjectName(u"threeCharacterBtn")
-
-        self.horizontalLayout_4.addWidget(self.threeCharacterBtn)
-
-        self.threeFilterBtn = QPushButton(Form)
-        self.threeFilterBtn.setObjectName(u"threeFilterBtn")
-
-        self.horizontalLayout_4.addWidget(self.threeFilterBtn)
-
-
-        self.verticalLayout_7.addLayout(self.horizontalLayout_4)
-
-        self.label_6 = QLabel(Form)
-        self.label_6.setObjectName(u"label_6")
-
-        self.verticalLayout_7.addWidget(self.label_6)
-
-        self.threeCategoryBox = QTextEdit(Form)
-        self.threeCategoryBox.setObjectName(u"threeCategoryBox")
-        sizePolicy.setHeightForWidth(self.threeCategoryBox.sizePolicy().hasHeightForWidth())
-        self.threeCategoryBox.setSizePolicy(sizePolicy)
-
-        self.verticalLayout_7.addWidget(self.threeCategoryBox)
-
-        self.result_label_3 = QLabel(Form)
-        self.result_label_3.setObjectName(u"result_label_3")
-
-        self.verticalLayout_7.addWidget(self.result_label_3)
-
-        self.threeResultBox = QTextBrowser(Form)
-        self.threeResultBox.setObjectName(u"threeResultBox")
-
-        self.verticalLayout_7.addWidget(self.threeResultBox)
-
-        self.verticalLayout_7.setStretch(3, 1)
-        self.verticalLayout_7.setStretch(5, 5)
-
-        self.horizontalLayout_6.addLayout(self.verticalLayout_7)
-
-        self.verticalLayout_10 = QVBoxLayout()
-        self.verticalLayout_10.setObjectName(u"verticalLayout_10")
-        self.fourKeyBox = QLineEdit(Form)
-        self.fourKeyBox.setObjectName(u"fourKeyBox")
-
-        self.verticalLayout_10.addWidget(self.fourKeyBox)
-
-        self.horizontalLayout_5 = QHBoxLayout()
-        self.horizontalLayout_5.setObjectName(u"horizontalLayout_5")
-        self.fourDigitBtn = QPushButton(Form)
-        self.fourDigitBtn.setObjectName(u"fourDigitBtn")
-
-        self.horizontalLayout_5.addWidget(self.fourDigitBtn)
-
-        self.fourCategoryBtn = QPushButton(Form)
-        self.fourCategoryBtn.setObjectName(u"fourCategoryBtn")
-
-        self.horizontalLayout_5.addWidget(self.fourCategoryBtn)
-
-        self.fourCharacterBtn = QPushButton(Form)
-        self.fourCharacterBtn.setObjectName(u"fourCharacterBtn")
-
-        self.horizontalLayout_5.addWidget(self.fourCharacterBtn)
-
-        self.fourFilterBtn = QPushButton(Form)
-        self.fourFilterBtn.setObjectName(u"fourFilterBtn")
-
-        self.horizontalLayout_5.addWidget(self.fourFilterBtn)
-
-
-        self.verticalLayout_10.addLayout(self.horizontalLayout_5)
-
-        self.label_8 = QLabel(Form)
-        self.label_8.setObjectName(u"label_8")
-
-        self.verticalLayout_10.addWidget(self.label_8)
-
-        self.fourCategoryBox = QTextEdit(Form)
-        self.fourCategoryBox.setObjectName(u"fourCategoryBox")
-        sizePolicy.setHeightForWidth(self.fourCategoryBox.sizePolicy().hasHeightForWidth())
-        self.fourCategoryBox.setSizePolicy(sizePolicy)
-
-        self.verticalLayout_10.addWidget(self.fourCategoryBox)
-
-        self.result_label_4 = QLabel(Form)
-        self.result_label_4.setObjectName(u"result_label_4")
-
-        self.verticalLayout_10.addWidget(self.result_label_4)
-
-        self.fourResultBox = QTextBrowser(Form)
-        self.fourResultBox.setObjectName(u"fourResultBox")
-
-        self.verticalLayout_10.addWidget(self.fourResultBox)
-
-        self.verticalLayout_10.setStretch(3, 1)
-        self.verticalLayout_10.setStretch(5, 5)
-
-        self.horizontalLayout_6.addLayout(self.verticalLayout_10)
-
-
-        self.verticalLayout_2.addLayout(self.horizontalLayout_6)
-
-
-        self.retranslateUi(Form)
-
-        QMetaObject.connectSlotsByName(Form)
-    # setupUi
-
-    def retranslateUi(self, Form):
-        Form.setWindowTitle(QCoreApplication.translate("Form", u"\u6570\u636e\u7b5b\u9009", None))
-        self.filePathBox.setPlaceholderText(QCoreApplication.translate("Form", u"\u8bf7\u9009\u62e9\u6587\u4ef6", None))
-        self.fileBtn.setText(QCoreApplication.translate("Form", u"\u6587\u4ef6\u9009\u62e9", None))
-        self.firstDigitBtn.setText(QCoreApplication.translate("Form", u"\u63d2\u5165\u6570\u5b57", None))
-        self.firstCategoryBtn.setText(QCoreApplication.translate("Form", u"\u63d2\u5165\u7c7b\u522b", None))
-        self.firstCharacterBtn.setText(QCoreApplication.translate("Form", u"\u63d2\u5165\u82f1\u6587", None))
-        self.firstFilterBtn.setText(QCoreApplication.translate("Form", u"\u63d0\u53d6", None))
-        self.label_2.setText(QCoreApplication.translate("Form", u"\u81ea\u5b9a\u4e49\u7c7b\u522b\u8bcd", None))
-        self.result_label_1.setText(QCoreApplication.translate("Form", u"\u63d0\u53d6\u7ed3\u679c", None))
-        self.secondDigitBtn.setText(QCoreApplication.translate("Form", u"\u63d2\u5165\u6570\u5b57", None))
-        self.secondCategoryBtn.setText(QCoreApplication.translate("Form", u"\u63d2\u5165\u7c7b\u522b", None))
-        self.secondCharacterBtn.setText(QCoreApplication.translate("Form", u"\u63d2\u5165\u82f1\u6587", None))
-        self.secondFilterBtn.setText(QCoreApplication.translate("Form", u"\u63d0\u53d6", None))
-        self.label_4.setText(QCoreApplication.translate("Form", u"\u81ea\u5b9a\u4e49\u7c7b\u522b\u8bcd", None))
-        self.result_label_2.setText(QCoreApplication.translate("Form", u"\u63d0\u53d6\u7ed3\u679c", None))
-        self.threeDigitBtn.setText(QCoreApplication.translate("Form", u"\u63d2\u5165\u6570\u5b57", None))
-        self.threeCategoryBtn.setText(QCoreApplication.translate("Form", u"\u63d2\u5165\u7c7b\u522b", None))
-        self.threeCharacterBtn.setText(QCoreApplication.translate("Form", u"\u63d2\u5165\u82f1\u6587", None))
-        self.threeFilterBtn.setText(QCoreApplication.translate("Form", u"\u63d0\u53d6", None))
-        self.label_6.setText(QCoreApplication.translate("Form", u"\u81ea\u5b9a\u4e49\u7c7b\u522b\u8bcd", None))
-        self.result_label_3.setText(QCoreApplication.translate("Form", u"\u63d0\u53d6\u7ed3\u679c", None))
-        self.fourDigitBtn.setText(QCoreApplication.translate("Form", u"\u63d2\u5165\u6570\u5b57", None))
-        self.fourCategoryBtn.setText(QCoreApplication.translate("Form", u"\u63d2\u5165\u7c7b\u522b", None))
-        self.fourCharacterBtn.setText(QCoreApplication.translate("Form", u"\u63d2\u5165\u82f1\u6587", None))
-        self.fourFilterBtn.setText(QCoreApplication.translate("Form", u"\u63d0\u53d6", None))
-        self.label_8.setText(QCoreApplication.translate("Form", u"\u81ea\u5b9a\u4e49\u7c7b\u522b\u8bcd", None))
-        self.result_label_4.setText(QCoreApplication.translate("Form", u"\u63d0\u53d6\u7ed3\u679c", None))
-    # retranslateUi
-

+ 0 - 332
src/tmp/filter/DataFilter.ui

@@ -1,332 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<ui version="4.0">
- <class>Form</class>
- <widget class="QWidget" name="Form">
-  <property name="windowModality">
-   <enum>Qt::NonModal</enum>
-  </property>
-  <property name="enabled">
-   <bool>true</bool>
-  </property>
-  <property name="geometry">
-   <rect>
-    <x>0</x>
-    <y>0</y>
-    <width>1546</width>
-    <height>514</height>
-   </rect>
-  </property>
-  <property name="minimumSize">
-   <size>
-    <width>800</width>
-    <height>400</height>
-   </size>
-  </property>
-  <property name="windowTitle">
-   <string>数据筛选</string>
-  </property>
-  <layout class="QVBoxLayout" name="verticalLayout_2">
-   <item>
-    <layout class="QHBoxLayout" name="horizontalLayout_2">
-     <item>
-      <widget class="QLineEdit" name="filePathBox">
-       <property name="enabled">
-        <bool>false</bool>
-       </property>
-       <property name="placeholderText">
-        <string>请选择文件</string>
-       </property>
-      </widget>
-     </item>
-     <item>
-      <widget class="QPushButton" name="fileBtn">
-       <property name="text">
-        <string>文件选择</string>
-       </property>
-      </widget>
-     </item>
-    </layout>
-   </item>
-   <item>
-    <layout class="QHBoxLayout" name="horizontalLayout_6">
-     <property name="sizeConstraint">
-      <enum>QLayout::SetDefaultConstraint</enum>
-     </property>
-     <item>
-      <layout class="QVBoxLayout" name="verticalLayout" stretch="0,0,0,1,0,5">
-       <item>
-        <widget class="QLineEdit" name="firstKeyBox"/>
-       </item>
-       <item>
-        <layout class="QHBoxLayout" name="horizontalLayout">
-         <item>
-          <widget class="QPushButton" name="firstDigitBtn">
-           <property name="text">
-            <string>插入数字</string>
-           </property>
-          </widget>
-         </item>
-         <item>
-          <widget class="QPushButton" name="firstCategoryBtn">
-           <property name="text">
-            <string>插入类别</string>
-           </property>
-          </widget>
-         </item>
-         <item>
-          <widget class="QPushButton" name="firstCharacterBtn">
-           <property name="text">
-            <string>插入英文</string>
-           </property>
-          </widget>
-         </item>
-         <item>
-          <widget class="QPushButton" name="firstFilterBtn">
-           <property name="text">
-            <string>提取</string>
-           </property>
-          </widget>
-         </item>
-        </layout>
-       </item>
-       <item>
-        <widget class="QLabel" name="label_2">
-         <property name="text">
-          <string>自定义类别词</string>
-         </property>
-        </widget>
-       </item>
-       <item>
-        <widget class="QTextEdit" name="firstCategoryBox">
-         <property name="sizePolicy">
-          <sizepolicy hsizetype="Expanding" vsizetype="Expanding">
-           <horstretch>0</horstretch>
-           <verstretch>0</verstretch>
-          </sizepolicy>
-         </property>
-        </widget>
-       </item>
-       <item>
-        <widget class="QLabel" name="result_label_1">
-         <property name="text">
-          <string>提取结果</string>
-         </property>
-        </widget>
-       </item>
-       <item>
-        <widget class="QTextBrowser" name="firstResultBox">
-         <property name="sizePolicy">
-          <sizepolicy hsizetype="Expanding" vsizetype="Expanding">
-           <horstretch>0</horstretch>
-           <verstretch>0</verstretch>
-          </sizepolicy>
-         </property>
-        </widget>
-       </item>
-      </layout>
-     </item>
-     <item>
-      <layout class="QVBoxLayout" name="verticalLayout_4" stretch="0,0,0,1,0,5">
-       <item>
-        <widget class="QLineEdit" name="secondKeyBox"/>
-       </item>
-       <item>
-        <layout class="QHBoxLayout" name="horizontalLayout_3">
-         <item>
-          <widget class="QPushButton" name="secondDigitBtn">
-           <property name="text">
-            <string>插入数字</string>
-           </property>
-          </widget>
-         </item>
-         <item>
-          <widget class="QPushButton" name="secondCategoryBtn">
-           <property name="text">
-            <string>插入类别</string>
-           </property>
-          </widget>
-         </item>
-         <item>
-          <widget class="QPushButton" name="secondCharacterBtn">
-           <property name="text">
-            <string>插入英文</string>
-           </property>
-          </widget>
-         </item>
-         <item>
-          <widget class="QPushButton" name="secondFilterBtn">
-           <property name="text">
-            <string>提取</string>
-           </property>
-          </widget>
-         </item>
-        </layout>
-       </item>
-       <item>
-        <widget class="QLabel" name="label_4">
-         <property name="text">
-          <string>自定义类别词</string>
-         </property>
-        </widget>
-       </item>
-       <item>
-        <widget class="QTextEdit" name="secondCategoryBox">
-         <property name="sizePolicy">
-          <sizepolicy hsizetype="Expanding" vsizetype="Expanding">
-           <horstretch>0</horstretch>
-           <verstretch>0</verstretch>
-          </sizepolicy>
-         </property>
-        </widget>
-       </item>
-       <item>
-        <widget class="QLabel" name="result_label_2">
-         <property name="text">
-          <string>提取结果</string>
-         </property>
-        </widget>
-       </item>
-       <item>
-        <widget class="QTextBrowser" name="secondResultBox"/>
-       </item>
-      </layout>
-     </item>
-     <item>
-      <layout class="QVBoxLayout" name="verticalLayout_7" stretch="0,0,0,1,0,5">
-       <item>
-        <widget class="QLineEdit" name="threeKeyBox"/>
-       </item>
-       <item>
-        <layout class="QHBoxLayout" name="horizontalLayout_4">
-         <item>
-          <widget class="QPushButton" name="threeDigitBtn">
-           <property name="text">
-            <string>插入数字</string>
-           </property>
-          </widget>
-         </item>
-         <item>
-          <widget class="QPushButton" name="threeCategoryBtn">
-           <property name="text">
-            <string>插入类别</string>
-           </property>
-          </widget>
-         </item>
-         <item>
-          <widget class="QPushButton" name="threeCharacterBtn">
-           <property name="text">
-            <string>插入英文</string>
-           </property>
-          </widget>
-         </item>
-         <item>
-          <widget class="QPushButton" name="threeFilterBtn">
-           <property name="text">
-            <string>提取</string>
-           </property>
-          </widget>
-         </item>
-        </layout>
-       </item>
-       <item>
-        <widget class="QLabel" name="label_6">
-         <property name="text">
-          <string>自定义类别词</string>
-         </property>
-        </widget>
-       </item>
-       <item>
-        <widget class="QTextEdit" name="threeCategoryBox">
-         <property name="sizePolicy">
-          <sizepolicy hsizetype="Expanding" vsizetype="Expanding">
-           <horstretch>0</horstretch>
-           <verstretch>0</verstretch>
-          </sizepolicy>
-         </property>
-        </widget>
-       </item>
-       <item>
-        <widget class="QLabel" name="result_label_3">
-         <property name="text">
-          <string>提取结果</string>
-         </property>
-        </widget>
-       </item>
-       <item>
-        <widget class="QTextBrowser" name="threeResultBox"/>
-       </item>
-      </layout>
-     </item>
-     <item>
-      <layout class="QVBoxLayout" name="verticalLayout_10" stretch="0,0,0,1,0,5">
-       <item>
-        <widget class="QLineEdit" name="fourKeyBox"/>
-       </item>
-       <item>
-        <layout class="QHBoxLayout" name="horizontalLayout_5">
-         <item>
-          <widget class="QPushButton" name="fourDigitBtn">
-           <property name="text">
-            <string>插入数字</string>
-           </property>
-          </widget>
-         </item>
-         <item>
-          <widget class="QPushButton" name="fourCategoryBtn">
-           <property name="text">
-            <string>插入类别</string>
-           </property>
-          </widget>
-         </item>
-         <item>
-          <widget class="QPushButton" name="fourCharacterBtn">
-           <property name="text">
-            <string>插入英文</string>
-           </property>
-          </widget>
-         </item>
-         <item>
-          <widget class="QPushButton" name="fourFilterBtn">
-           <property name="text">
-            <string>提取</string>
-           </property>
-          </widget>
-         </item>
-        </layout>
-       </item>
-       <item>
-        <widget class="QLabel" name="label_8">
-         <property name="text">
-          <string>自定义类别词</string>
-         </property>
-        </widget>
-       </item>
-       <item>
-        <widget class="QTextEdit" name="fourCategoryBox">
-         <property name="sizePolicy">
-          <sizepolicy hsizetype="Expanding" vsizetype="Expanding">
-           <horstretch>0</horstretch>
-           <verstretch>0</verstretch>
-          </sizepolicy>
-         </property>
-        </widget>
-       </item>
-       <item>
-        <widget class="QLabel" name="result_label_4">
-         <property name="text">
-          <string>提取结果</string>
-         </property>
-        </widget>
-       </item>
-       <item>
-        <widget class="QTextBrowser" name="fourResultBox"/>
-       </item>
-      </layout>
-     </item>
-    </layout>
-   </item>
-  </layout>
- </widget>
- <resources/>
- <connections/>
-</ui>

+ 0 - 0
src/tmp/filter/__init__.py


+ 0 - 183
src/tmp/filter/main.py

@@ -1,183 +0,0 @@
-# -*- coding: utf-8 -*-
-import json
-import os.path
-import re
-import sys
-from functools import partial
-
-from PySide6.QtWidgets import QApplication, QMessageBox, QFileDialog, QWidget, QLineEdit, QPushButton, \
-  QTextEdit, QTextBrowser
-
-from src.tmp.filter.DataFilter import Ui_Form
-
-category_pattern = re.compile(r'\[类别\]')
-digit_pattern = re.compile(r'\[数字\]')
-english_pattern = re.compile(r'\[字母\]')
-
-CHARACTER_FILTER_STR = "[字母]"
-DIGIT_FILTER_STR = "[数字]"
-CATEGORY_FILTER_STR = "[类别]"
-
-CONFIG_FILE_PATH = "./config.json"
-CONFIG_ITEM_LAST_SELECT_FILE_PATH = "lastSelectFilePath"
-
-class MyMainForm(QWidget, Ui_Form):
-
-  def __init__(self, parent=None):
-    super(MyMainForm, self).__init__(parent)
-    self.setupUi(self)
-    self.bind()
-    self.loadConfig()
-
-  def bind(self):
-    self.toolDict = {
-      self.firstDigitBtn.objectName(): self.firstKeyBox,
-      self.firstCategoryBtn.objectName(): self.firstKeyBox,
-      self.firstCharacterBtn.objectName(): self.firstKeyBox,
-      self.secondDigitBtn.objectName(): self.secondKeyBox,
-      self.secondCategoryBtn.objectName(): self.secondKeyBox,
-      self.secondCharacterBtn.objectName(): self.secondKeyBox,
-      self.threeDigitBtn.objectName(): self.threeKeyBox,
-      self.threeCategoryBtn.objectName(): self.threeKeyBox,
-      self.threeCharacterBtn.objectName(): self.threeKeyBox,
-      self.fourDigitBtn.objectName(): self.fourKeyBox,
-      self.fourCategoryBtn.objectName(): self.fourKeyBox,
-      self.fourCharacterBtn.objectName(): self.fourKeyBox
-    }
-    self.resultDict = {
-      self.firstFilterBtn.objectName(): (self.firstKeyBox, self.firstCategoryBox, self.firstResultBox, None, self.result_label_1),
-      self.secondFilterBtn.objectName(): (self.secondKeyBox, self.secondCategoryBox, self.secondResultBox, self.firstResultBox, self.result_label_2),
-      self.threeFilterBtn.objectName(): (self.threeKeyBox, self.threeCategoryBox, self.threeResultBox, self.secondResultBox, self.result_label_3),
-      self.fourFilterBtn.objectName(): (self.fourKeyBox, self.fourCategoryBox, self.fourResultBox, self.threeResultBox, self.result_label_4)
-    }
-
-    self.fileBtn.clicked.connect(self.selectFile)
-
-    self.firstCategoryBtn.clicked.connect(partial(self.add_filter_str, self.firstCategoryBtn, CATEGORY_FILTER_STR))
-    self.firstDigitBtn.clicked.connect(partial(self.add_filter_str, self.firstDigitBtn, DIGIT_FILTER_STR))
-    self.firstCharacterBtn.clicked.connect(partial(self.add_filter_str, self.firstCharacterBtn, CHARACTER_FILTER_STR))
-    self.firstFilterBtn.clicked.connect(partial(self.submit, self.firstFilterBtn))
-
-    self.secondCategoryBtn.clicked.connect(partial(self.add_filter_str, self.secondCategoryBtn, CATEGORY_FILTER_STR))
-    self.secondDigitBtn.clicked.connect(partial(self.add_filter_str, self.secondDigitBtn, DIGIT_FILTER_STR))
-    self.secondCharacterBtn.clicked.connect(partial(self.add_filter_str, self.secondCharacterBtn, CHARACTER_FILTER_STR))
-    self.secondFilterBtn.clicked.connect(partial(self.submit, self.secondFilterBtn))
-
-    self.threeCategoryBtn.clicked.connect(partial(self.add_filter_str, self.threeCategoryBtn, CATEGORY_FILTER_STR))
-    self.threeDigitBtn.clicked.connect(partial(self.add_filter_str, self.threeDigitBtn, DIGIT_FILTER_STR))
-    self.threeCharacterBtn.clicked.connect(partial(self.add_filter_str, self.threeCharacterBtn, CHARACTER_FILTER_STR))
-    self.threeFilterBtn.clicked.connect(partial(self.submit, self.threeFilterBtn))
-
-    self.fourCategoryBtn.clicked.connect(partial(self.add_filter_str, self.fourCategoryBtn, CATEGORY_FILTER_STR))
-    self.fourDigitBtn.clicked.connect(partial(self.add_filter_str, self.fourDigitBtn, DIGIT_FILTER_STR))
-    self.fourCharacterBtn.clicked.connect(partial(self.add_filter_str, self.fourCharacterBtn, CHARACTER_FILTER_STR))
-    self.fourFilterBtn.clicked.connect(partial(self.submit, self.fourFilterBtn))
-
-  def loadConfig(self):
-    if os.path.isfile(CONFIG_FILE_PATH):
-      with open(CONFIG_FILE_PATH, 'r', encoding='utf-8') as f:
-        config = json.loads(f.read())
-        self.filePathBox.setText(config[CONFIG_ITEM_LAST_SELECT_FILE_PATH])
-
-
-  def selectFile(self):
-    file_path, file_type = QFileDialog.getOpenFileName(self, "选择文件")
-    with open(CONFIG_FILE_PATH, 'w', encoding='utf-8') as f:
-      f.write(json.dumps({CONFIG_ITEM_LAST_SELECT_FILE_PATH: file_path}))
-    self.filePathBox.setText(file_path)
-
-  def add_filter_str(self, btn_widget: QPushButton, filter_text):
-    key_box = self.toolDict[btn_widget.objectName()]
-    key_box.setText(key_box.text() + filter_text)
-
-  def submit(self, filter_btn: QPushButton):
-
-    key_box, category_box, result_box, parent_result_box, result_label = self.resultDict[filter_btn.objectName()]
-
-    if not self.check(key_box, category_box, parent_result_box):
-      return
-
-    before_filter_cnt, after_filter_cnt, filter_result_arr = self.deal(key_box, category_box, parent_result_box)
-    result_label.setText("提取结果:原始数据%s条,筛选后%s条" % (before_filter_cnt, after_filter_cnt))
-    result_box.setText("\n".join(filter_result_arr))
-
-  def check(self, key_box: QLineEdit, category_box: QTextEdit, parent_result_box: QTextBrowser):
-    key_text = key_box.text()
-    if len(key_text) == 0:
-      QMessageBox.warning(self, "输入提示", "请输入待筛选关键词")
-      return False
-
-    cnt = 0
-    for pattern in [category_pattern, digit_pattern, english_pattern]:
-      if pattern.search(key_text) is not None:
-        cnt = cnt + 1
-      if cnt > 1:
-        QMessageBox.warning(self, "提示", "一次只能使用一种正则筛选项")
-        return False
-
-    category_text = category_box.toPlainText()
-    if category_pattern.search(key_text) is not None and len(category_text) == 0:
-      QMessageBox.warning(self, "提示", "使用类别筛选,请输入待筛选的类别关键词")
-      return False
-
-    if parent_result_box is None:
-      file_path = self.filePathBox.text()
-      if len(file_path) == 0:
-        QMessageBox.warning(self, "提示", "请选择带筛选文件")
-        return False
-    elif len(parent_result_box.toPlainText()) == 0:
-      QMessageBox.warning(self, "提示", "上级结果中没有数据")
-      return False
-
-    return True
-
-  def deal(self, key_box: QLineEdit, category_box: QTextEdit, parent_result_box: QTextBrowser):
-    key_text = key_box.text()
-
-    parent_key_arr = None
-    if parent_result_box is None:
-      with open(self.filePathBox.text(), 'r', encoding='utf-8') as f:
-        parent_key_arr = [content.replace("\n", "") for content in f.readlines()]
-    else:
-      parent_key_arr = parent_result_box.toPlainText().split("\n")
-
-    filter_result_arr = None
-    if category_pattern.search(key_text) is not None:
-      filter_result_arr = set()
-      categoryKeyArray = category_box.toPlainText().splitlines()
-      for categoryKey in categoryKeyArray:
-        filter_result_arr.update(self.filter(parent_key_arr, key_text, "类别", categoryKey))
-    elif digit_pattern.search(key_text) is not None:
-      filter_result_arr = self.filter(parent_key_arr, key_text, "数字", "0-9")
-    elif english_pattern.search(key_text) is not None:
-      filter_result_arr = self.filter(parent_key_arr, key_text, "字母", "A-Za-z")
-    else:
-      filter_result_arr = self.filter(parent_key_arr, key_text)
-
-    return len(parent_key_arr), len(filter_result_arr), filter_result_arr
-
-  def filter(self, originArray, inputText, oldStr=None, newStr=None):
-    resultArray = []
-    key_pattern = None
-    filter_pattern = None
-    if oldStr is not None and len(oldStr) > 0:
-      key_pattern = re.compile(inputText.replace("[{}]".format(oldStr), ""))
-    else:
-      key_pattern = re.compile(inputText)
-    if newStr is not None and len(newStr) > 0:
-      filter_pattern = re.compile("[{}]".format(newStr))
-    for originKey in originArray:
-      if key_pattern.search(originKey) is not None:
-        if filter_pattern is not None:
-          if filter_pattern.search(originKey) is not None:
-            resultArray.append(originKey)
-        else:
-          resultArray.append(originKey)
-    return resultArray
-
-
-if __name__ == "__main__":
-  app = QApplication(sys.argv)
-  myWin = MyMainForm()
-  myWin.show()
-  sys.exit(app.exec())

+ 0 - 147
src/tmp/key.py

@@ -1,147 +0,0 @@
-# -*- coding:utf-8 -*-
-
-from concurrent.futures import ProcessPoolExecutor, as_completed
-import logging
-import os
-from time import time
-import config
-import tools
-import jieba
-import mmap
-
-# 优化
-# 1. 更改为使用多进程
-
-# 日志配置初始化
-tools.init_log()
-
-def sub_process(start_pos, end_pos, stop_word):
-    """
-    子进程
-    """
-    pid = os.getpid()
-
-    logging.debug("子进程-%d 开始执行分词任务,开始位置:%d,结束位置:%d" % (pid, start_pos, end_pos))
-
-    # 临时容器
-    tmp_list = []
-
-    # 开始时间
-    start_time = time()
-    
-    with open(config.MERGE_FILE, "r", encoding=config.ENCODING_CHARSET) as f, \
-        mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as fmmap:
-
-        fmmap.seek(start_pos)
-
-        while True:
-            # 越界检测
-            cur_pos = fmmap.tell()
-            if cur_pos >= end_pos:
-                break
-
-            # 读取关键词
-            key = fmmap.readline().decode("UTF-8").replace("\r","").replace("\n","")
-
-            # 读取不到任何内容结束执行
-            if not key :
-                continue
-                
-            # 分词
-            tmp_stems = list(jieba.cut_for_search(key))
-
-            # 排除停用词
-            stems = set()
-            for stem in tmp_stems:
-                if stem in stop_word:
-                    continue
-                stems.add(stem)
-                
-            # 以防止词根数为0
-            if len(stems) == 0:
-                continue
-        
-            tmp_list.append((key , list(stems)))
-    
-    logging.debug("子进程-%d 执行分词任务结束,耗时:%f" % (pid, (time() - start_time)))
-    
-    return tmp_list
-
-def main_process():
-    """
-    主进程
-    """
-
-    # 进程池数
-    process_num = 4
-
-    # 任务分割大小
-    split_num = 500000
-
-    # 位置信息索引
-    pos_index = []
-
-    # 总关键词数量
-    total_num = 0
-
-    # 加载停用词
-    stop_word = tools.load_stop_word()
-
-    start_time = time()
-
-    # 记录位置信息
-    logging.info("主进程 开始构建位置索引信息")
-    with open(config.MERGE_FILE, "r", encoding=config.ENCODING_CHARSET) as f, \
-        mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as fmmap:
-
-        while True:
-            # 获取当前位置
-            cur_pos = fmmap.tell()
-            # 移动到一下行
-            line = fmmap.readline()
-            # 结束检测
-            if not line:
-                break
-            # 记录
-            pos_index.append(cur_pos)
-
-        # 计算总关键词数量
-        total_num = len(pos_index)
-
-    # 划分子任务
-    logging.info("主进程 开始划分子任务")
-    tasks = tools.avg_split_task(total_num, split_num)
-
-    with ProcessPoolExecutor(process_num) as process_pool, \
-        open(config.KEY_FILE, "w", encoding=config.ENCODING_CHARSET) as f_key:
-
-        logging.info("主进程 提交任务到子进程")
-        process_futures = [process_pool.submit(sub_process, pos_index[task[0]], pos_index[task[1]], stop_word) for task in tasks]
-
-        # 移除无效变量 以防占用内存
-        del pos_index
-        del tasks
-
-        # 序号计算
-        count = -1 
-        for p_future in as_completed(process_futures):
-            result = p_future.result()
-            if result:
-                for key, stems in result:
-                    count = count + 1
-                    # 写入文件中
-                    f_key.write("%d,%s,%s\n"%(count, key, list(stems)))
-            
-            # 移除无效变量 以防占用内存
-            process_futures.remove(p_future)
-    
-    logging.info("主进程 构建KEY表耗时:%f" % (time() - start_time))
-
-if __name__ == '__main__':
-
-    TITLE = "关键词表 生成"
-    tools.log_start_msg(TITLE)
-
-    main_process()
-
-    tools.log_end_msg(TITLE)

+ 0 - 53
src/tmp/key_index.py

@@ -1,53 +0,0 @@
-# -*- coding:utf-8 -*-
-
-import config
-import tools
-import mmap
-
-
-def main():
-    
-    # 关键词索引容器,
-    key_index = []
-
-    with open(config.KEY_FILE, "r", encoding=config.ENCODING_CHARSET) as fkey, \
-        mmap.mmap(fkey.fileno(), 0, access=mmap.ACCESS_READ) as fmmap:
-        
-        # 总大小
-        total_num = fmmap.size()
-
-        while True:
-            # 读取光标位置
-            cur_pos = fmmap.tell()
-            # 把光标移动到下一行
-            line = fmmap.readline()
-            # 如果没有数据则结束
-            if not line :
-                break
-
-            # 建立关键词序号和位置的关系,以索引当行号(0基)
-            key_index.append(cur_pos)
-
-            # 进度显示
-            tools.tip_in_size(total_num, cur_pos)
-        
-    with open("./data/tmp/key_index_test.csv", "w", encoding=config.ENCODING_CHARSET) as f:
-        f.write(",".join([str(i) for i in key_index]))
-
-        
-    # 保存索引
-    # tools.save_obj(config.KEY_INDEX_CACHE, key_index)
-
-
-if __name__ == '__main__':
-
-    TITLE = "关键词索引"
-
-    # 日志配置初始化
-    tools.init_log()
-    tools.log_start_msg(TITLE)
-
-    main()
-    
-    tools.log_end_msg(TITLE)
-    

+ 0 - 159
src/tmp/key_reverse.py

@@ -1,159 +0,0 @@
-# -*- coding:utf-8 -*-
-
-from concurrent.futures import ProcessPoolExecutor, as_completed
-import math
-from time import time
-import os
-import config
-import tools
-import re
-import logging
-import mmap
-
-
-tools.init_log()
-
-if __name__ != "__main__":
-
-    # 正则提取
-    # 倒排表 索引
-    index_re = r"(\d+),"
-    index_pattern = re.compile(index_re, re.I)
-
-    # KEY表 词根
-    stem_re = r"'([^,]*)'"
-    stem_pattern = re.compile(stem_re, re.I)
-
-def sub_process(start_pos, end_pos):
-    """
-    子进程
-    """
-    pid = os.getpid()
-
-    logging.debug("进程-%d 开始执行任务,开始位置:%d,结束位置:%d" % (pid, start_pos, end_pos))
-
-    # 开始时间
-    start_time = time()
-
-    # 倒排表和统计信息容器
-    reverse_dict = {}
-
-    with open(config.KEY_FILE, "r", encoding=config.ENCODING_CHARSET) as f_key, \
-        mmap.mmap(f_key.fileno(), 0, access=mmap.ACCESS_READ) as f_mmap:
-        # 移动到开始位置
-        f_mmap.seek(start_pos)
-
-        while True:
-            # 获取当前处理位置
-            cur_pos = f_mmap.tell()
-
-            # 越界检查
-            if cur_pos >= end_pos:
-                break
-
-            # 提取数据
-            line = f_mmap.readline().decode(config.ENCODING_CHARSET)
-            m = index_pattern.match(line)
-            # 获取关键词序号、词根
-            index = m.group(1)
-            stems = stem_pattern.findall(line)
-
-            # 构建倒排表和统计数据量
-            for stem in stems:
-                obj = reverse_dict.get(stem)
-                if obj:
-                    obj["count"] = obj["count"] + 1
-                    obj["indexs"].add(index) 
-                else:
-                    tmp_indexs = set()
-                    tmp_indexs.add(index)
-                    reverse_dict[stem]= {
-                        "count": 1,
-                        "indexs": tmp_indexs
-                    }
-    
-    logging.debug("子进程-%d 任务结束,耗时:%f" % (pid, (time() - start_time)))
-
-    return reverse_dict
-    
-
-def main_process():
-    
-    logging.info("主进程 开始执行初始化")
-    
-    # 进程处理数
-    process_num = 4
-
-    # 关键表索引
-    key_index = tools.load_obj(config.KEY_INDEX_CACHE)
-
-    # 开始时间
-    start_time = time()
-
-    # 关键词总数
-    total_num = len(key_index)
-
-    # 任务分割大小
-    split_num = math.ceil(total_num/process_num)
-
-    logging.info("主进程 开始划分子任务")
-    tasks = tools.avg_split_task(total_num, split_num)
-    
-    with ProcessPoolExecutor(process_num) as process_pool:
-
-        logging.info("主进程 提交任务到子进程")
-        process_futures = [process_pool.submit(sub_process, key_index[task[0]], key_index[task[1]]) for task in tasks]
-
-        # 移除无效变量 以防占用内存
-        del tasks
-        del key_index
-
-         # 倒排表和统计信息容器
-        reverse_dict = {}
-        
-        # 进行数据合并
-        for p_future in as_completed(process_futures):
-            result = p_future.result()
-            for key, val_obj in result.items():
-                reverse_obj = reverse_dict.get(key)
-                if reverse_obj:
-                    reverse_obj["count"] = reverse_obj["count"] + val_obj["count"]
-                    reverse_obj["indexs"] = reverse_obj["indexs"] | val_obj["indexs"]
-                else:
-                    reverse_dict[key] = val_obj
-            
-            # 移除无效变量 以防占用内存
-            process_futures.remove(p_future)
-        
-        logging.info("主进程 已获取全部子进程返回结果,总数据量:%d" % len(reverse_dict))
-    
-    logging.info("主进程 对词根关联的索引进行排序和转换")
-    for val_obj in reverse_dict.values():
-        val_obj["indexs"] = list(val_obj["indexs"])
-        val_obj["indexs"].sort()
-
-    # 根据关键词数量进行排序,这里通过items()方法转成元组列表,才能进行排序
-    logging.info("主进程 根据关键词数量进行排列")
-    sorted_reverse_list = sorted(reverse_dict.items(), key=lambda x: x[1]["count"], reverse=True)
-
-    # 保存到本地文件
-    logging.info("主进程 保存到本地")
-    with open("./data/tmp/reverse_test.csv", "w", encoding=config.ENCODING_CHARSET) as f_reverse, \
-        open(config.KEY_REVERSE_STATISTICS_FILE, "w", encoding=config.ENCODING_CHARSET) as f_statistics:
-        for key, val_obj in sorted_reverse_list:
-            f_reverse.write("%s,%s\n" % (key, val_obj["indexs"]))
-            f_statistics.write("%s,%d\n" % (key, val_obj["count"]))
-    
-    logging.info("主进程 构建倒排索引耗时:%f" % (time() - start_time))
-
-
-if __name__ == "__main__":
-    
-    TITLE = "生成关键词倒排和统计信息"
-    tools.log_start_msg(TITLE)
-
-    main_process()
-
-    tools.log_end_msg(TITLE)
-
-   

+ 0 - 51
src/tmp/key_reverse_index.py

@@ -1,51 +0,0 @@
-# -*- coding:utf-8 -*-
-
-import config
-import tools
-import mmap
-
-TITLE = "关键词倒排索引"
-
-def main():
-    # 日志配置初始化
-    tools.init_log()
-    tools.log_start_msg(TITLE)
-
-    # 关键词倒排索引容器
-    reverse_index = []
-
-    with open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as freverse, \
-        mmap.mmap(freverse.fileno(), 0, access=mmap.ACCESS_READ) as fmmap:
-        
-        # 总大小
-        total_num = fmmap.size()
-
-        while True:
-            # 读取光标位置
-            cur_pos = fmmap.tell()
-            # 把光标移动到下一行
-            line = fmmap.readline().decode(config.ENCODING_CHARSET)
-
-            # 如果没有数据则结束
-            if not line :
-                break
-            
-            # 获取词根位置,建立词根和位置的关系
-            index = line.index(",")
-            key = line[:index]
-            next_pos = fmmap.tell()
-            reverse_index.append((key, cur_pos, next_pos))
-            
-            # 进度显示
-            tools.tip_in_size(total_num, cur_pos)
-        
-        # 保存索引
-        with open("./data/tmp/reverse_index_test.csv", "w", encoding=config.ENCODING_CHARSET) as f:
-            for key, cur_pos, next_pos in reverse_index:
-                f.write("%s,%d,%d\n" % (key, cur_pos, next_pos))
-        # tools.save_obj(config.KEY_REVERSE_INDEX_CACHE, key_reverse_index_cache)
-
-    tools.log_end_msg(TITLE)
-
-if __name__ == "__main__":
-    main()

+ 0 - 211
src/tmp/key_reverse_statistics.py

@@ -1,211 +0,0 @@
-# -*- coding:utf-8 -*-
-
-from concurrent.futures import ProcessPoolExecutor, as_completed
-import mmap
-import os
-import config
-import tools
-import ast
-import logging
-import math
-
-TITLE = "关键词倒排文件 统计"
-
-# def reverse_statistics(start_pos, end_pos):
-
-def handle(start_pos, end_pos):
-
-    print("进程:%d, 统计开始,开始位置:%d,结束位置:%d" % (os.getpid(), start_pos, end_pos))
-
-    # 统计信息容器
-    reverse_statistics = {}
-
-    with open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as fr, \
-        mmap.mmap(fr.fileno(), 0 , access=mmap.ACCESS_READ) as fmmap:
-        # 调整开始位置
-        fmmap.seek(start_pos)
-
-        while True:
-            cur_pos = fmmap.tell()
-            # 越界检测
-            if cur_pos >= end_pos:
-                break
-
-            line = fmmap.readline().decode(config.ENCODING_CHARSET)
-            index=line.index(",")
-            key = line[:index]
-            word_root = line[index+1:]
-            word_root = ast.literal_eval(word_root)
-            l = len(word_root)
-
-            reverse_statistics[key]=l
-
-    logging.info("进程:%d, 统计结束" % os.getpid())
-
-    return {
-        "pid":os.getpid(),
-        "statistics":reverse_statistics
-    }
-
-
-def main2():
-    # 日志信息配置
-    tools.init_log()
-    tools.log_start_msg(TITLE)
-
-    # 进程数
-    process_num = os.cpu_count()
-
-    # 加载缓存索引文件
-    key_reverse_index = tools.load_obj(config.KEY_REVERSE_INDEX_CACHE)
-
-    # 对索引文件中的元素进行平分
-
-    # 转成列表,计算总长 和 平分后的处理区间
-    key_list = [key for key in key_reverse_index.keys()]
-    key_list_len = len(key_list)
-    internal = math.ceil(key_list_len / process_num )
-
-    # 利用 缓存索引文件 生成处理区间的位置信息
-    # 位置信息容器
-    pos_list = []
-    for i in range(process_num + 1):
-        # 计算平分点在列表中的位置
-        l_pos = i * internal
-        # 如果超过列表大小需要额外处理
-        if l_pos > key_list_len:
-            l_pos = key_list_len -1
-        # 获取列表中的词根
-        key = key_list[l_pos:l_pos+1]
-        # 根据词根获取位置信息
-        pos = key_reverse_index[key[0]]
-        # 记录位置信息
-        pos_list.append(pos)
-
-
-    # 使用用进程池
-    pool = ProcessPoolExecutor(process_num)
-    # 生成任务
-    process_futures = []
-    for i in range(0, len(pos_list)-1):
-        pos = pos_list[i: i+2]
-        process_futures.append(pool.submit(handle, pos[0], pos[1]))
-
-    # with open(config.KEY_REVERSE_STATISTICS_FILE, "w", encoding=config.ENCODING_CHARSET) as fw:
-    #     for future in as_completed(process_futures):
-    #         logging.info("部分子任务统计结束,保存至本地 - 开始")
-    #         for key, value in future.result().items():
-    #             fw.write("%s,%s\n"%(key,value))
-    #         logging.info("部分子任务统计结束,保存至本地 - 结束")
-
-
-    results = []
-    for future in as_completed(process_futures):
-        result = future.result()
-        logging.info("进程:%d, 统计结束" % result["pid"])
-        results.append(result)
-
-    logging.info("统计结束,保存至本地 - 开始")
-    with open(config.KEY_REVERSE_STATISTICS_FILE, "w", encoding=config.ENCODING_CHARSET) as fw:
-        for r in results:
-            for key, value in r["statistics"].items():
-                fw.write("%s,%s\n"%(key,value))
-    logging.info("部分子任务统计结束,保存至本地 - 结束")
-
-    pool.shutdown(wait=True)
-
-    tools.log_end_msg(TITLE)
-
-    # 测试代码3
-    # pool = ProcessPoolExecutor(3)
-    # for i in range(1,5):
-    #     pool.submit(handle, "测试进程-%d"%i, i, i*10)
-
-    # pool.shutdown(wait=True)
-
-    # 测试代码2
-    # pool = Pool(3)
-    # for i in range(1,5):
-    #     pool.apply_async(handle, ("测试进程-%d"%i, i, i*10))
-    # pool.close()
-    # pool.join()
-    # print("结束")
-
-    # 测试代码1
-    # p = Process(target=handle, args=('测试进程', 1, 10))
-    # p.start()
-    # p.join()
-
-    # tools.init_log()
-    # tools.log_start_msg(TITLE)
-
-    # key_reverse_index = tools.load_obj(config.KEY_REVERSE_INDEX_CACHE)
-
-    # tmp = [key for key in key_reverse_index.keys()]
-
-    # l = len(tmp)
-    # print("总长:", l)
-    # internal = math.ceil(l / 4)
-    # print("间隔:", internal)
-    # pos = []
-    # for i in range(5):
-    #     t = i*internal
-    #     if t > l:
-    #         t = l-1
-    #     pos.append(t)
-    # print(pos)
-
-    # for item in pos:
-    #     key = tmp[item:item+1]
-    #     print(key)
-    #     pos = key_reverse_index[key[0]]
-    #     print(key, pos)
-
-
-    # reverse_statistics = {}
-    # logging.info("统计开始")
-    # with open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as fr, \
-    #     mmap.mmap(fr.fileno(), 0 , access=mmap.ACCESS_READ) as fmmap:
-    #     for line in fr:
-    #         index=line.index(",")
-    #         key = line[:index]
-    #         word_root = line[index+1:]
-    #         word_root = ast.literal_eval(word_root)
-    #         l = len(word_root)
-
-    #         reverse_statistics[key]=l
-
-    # logging.info("统计结束,保存至本地")
-    # with open(config.KEY_REVERSE_STATISTICS_FILE, "w", encoding=config.ENCODING_CHARSET) as fw:
-    #     for key, value in reverse_statistics:
-    #         fw.write("%s,%s\n"%(key,value))
-
-    # tools.log_end_msg(TITLE)
-
-
-def main():
-    tools.init_log()
-    tools.log_start_msg(TITLE)
-
-    reverse_statistics = {}
-    logging.info("统计开始")
-    with open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as fr, \
-        mmap.mmap(fr.fileno(), 0 , access=mmap.ACCESS_READ) as fmmap:
-        for line in fr:
-            index=line.index(",")
-            key = line[:index]
-            word_root = line[index+1:]
-            word_root = ast.literal_eval(word_root)
-            l = len(word_root)
-
-            reverse_statistics[key]=l
-
-    logging.info("统计结束,保存至本地")
-    with open(config.KEY_REVERSE_STATISTICS_FILE, "w", encoding=config.ENCODING_CHARSET) as fw:
-        for key, value in reverse_statistics:
-            fw.write("%s,%s\n"%(key,value))
-
-    tools.log_end_msg(TITLE)
-
-if __name__ == "__main__":
-    main2()

+ 0 - 97
src/tmp/merge.py

@@ -1,97 +0,0 @@
-# -*- coding: utf-8 -*-
-
-import config
-import os
-import tools
-import logging
-import zipfile
-
-
-# 待合并的文件目录
-DATA_DIR = "E:\Download\长尾关键词\长尾关键词-什么\普通-p"
-
-def get_files(path):
-    '''
-    读取文件夹下的文件列表
-    '''
-    file_list = []
-    for file in os.listdir(path):
-        file_list.append(os.path.join(path,file))
-    return file_list
-
-def merge_file_content():
-    """
-    合并文件下的所有文件中的内容(仅限关键词)
-
-    Parameters
-    ----------
-    dir_path : string
-        待读取的文件夹
-
-    dest_file : string
-        合并后输出的文件
-
-    exclude_file : list
-        跳过压缩文件中的文件
-    ----------
-    """
-    # 获取文件列表
-    files = get_files(DATA_DIR)
-
-    # 总文件数
-    total_num = len(files)
-    logging.info("待处理文件数:%d" % total_num)
-
-    # 排重过滤
-    repeat_set = set()
-
-    # 关键词排重前总数
-    total_count=0
-
-    # 读取数据并进行排重
-    for i, file in enumerate(files):
-        zfile = zipfile.ZipFile(file)
-        filenames = zfile.namelist()
-        for filename in filenames:
-
-            # 重新编码文件名为正确形式
-            realname = filename.encode('cp437').decode('gbk')
-
-            # 排除无效文件
-            if realname in config.MERGE_EXCLUDE_FILES:
-                continue
-
-            logging.info("正在处理文件: %s" % realname)
-
-            # 读取压缩文件中的文件
-            with zfile.open(filename) as file_content:
-                lines = file_content.readlines()
-                # 跳过开头两行
-                for line in lines[2:]:
-                    split = line.decode("gbk").split(",")
-                    # 只需要第一列的数据
-                    repeat_set.add(split[0])
-                    # 记录次数
-                    total_count = total_count + 1
-
-        tools.tip(total_num, i)
-
-    logging.info("正在保存合并结果,文件位置:%s,排重前数据量:%d,排重后数据量:%d" % (
-      config.MERGE_FILE, total_count, len(repeat_set)))
-    with open(config.MERGE_FILE, "w", encoding="utf-8") as f:
-        for item in repeat_set:
-            f.write(item)
-            f.write("\n")
-
-
-if __name__ == '__main__':
-
-    TITLE= "拓展词合并"
-
-    # 日志初始化
-    tools.init_log()
-    tools.log_start_msg(TITLE)
-
-    merge_file_content()
-
-    tools.log_end_msg(TITLE)

+ 0 - 58
src/tmp/split.py

@@ -1,58 +0,0 @@
-# -*-: coding:utf-8 -*-
-
-import csv
-import re
-
-def split():
-    INPUT_DATA = r"./data/agg_filter.csv"
-    OUTPUT_TEMP = "./data/split/agg_split_%d.txt"
-    OUTPUT_TEMP2 = "./data/split/agg_split_%d_%d.txt"
-
-    startPattern = re.compile("######开始######")
-    con_l = []
-    sub = None
-    with open(INPUT_DATA, "r", encoding="GBK") as fr:
-        for line in fr.readlines():
-            tl = startPattern.findall(line)
-            if len(tl) > 0:
-                sub = []
-                sub.append(line)
-                con_l.append(sub)
-            elif line.startswith("\n"):
-                continue
-            else:
-                sub.append(line)
-    
-    # step = 71500
-    # for i, v in enumerate(range(0, len(con_l), step)):
-    #     with open(OUTPUT_TEMP % (i+1), "w", encoding="GBK") as fw:
-    #         for ele in con_l[v:v+step]:
-    #             if len(ele) == 1:
-    #                 continue
-
-    #             fw.write("\n")
-    #             for content in ele:
-    #                 fw.write(content)
-    filter_l = [
-        (1000, 1000, [subList for subList in con_l if len(subList)>=1000]),
-        (500, 1000, [subList for subList in con_l if len(subList)>=500 and len(subList) < 1000]),
-        (100,500,[subList for subList in con_l if len(subList)>=100 and len(subList) < 500]),
-        (50,100,[subList for subList in con_l if len(subList)>=50 and len(subList)<100]),
-        (10,50,[subList for subList in con_l if len(subList)>=10 and len(subList)<50]),
-        (5,10,[subList for subList in con_l if len(subList)>=5 and len(subList)<10]),
-        (3,5,[subList for subList in con_l if len(subList)>=3 and len(subList)<5]),
-        (2,2,[subList for subList in con_l if len(subList)==2])
-        # (1,1,[subList for subList in con_l if len(subList)==1])
-    ]
-
-    for start, end, sublist in filter_l:
-        with open(OUTPUT_TEMP2 % (start, end), "w", encoding="GBK") as fw:
-            for ele in sublist:
-                fw.write("\n")
-                for content in ele:
-                    fw.write(content)
-    
-    
-   
-if __name__ == '__main__':
-    split()

+ 0 - 141
src/tmp/statistics.py

@@ -1,141 +0,0 @@
-# -*- coding:utf-8 -*-
-
-import tools
-import config
-import logging
-import random
-import time
-import ast
-import mmap
-
-TASK_TITLE = "数据统计分析"
-
-def transfer_str(num):
-    msg = None
-    if num >= 10000:
-        msg = "%d万%d" % (num//10000, num%10000)
-    else:
-        msg = str(num)
-    return msg
-
-def cal(list):
-    list_len = len(list)
-    list_count = sum(list)
-    sum_msg = transfer_str(list_len)
-    count_msg = transfer_str(list_count)
-    avg_msg = transfer_str(int(list_count/list_len))
-    return sum_msg, count_msg, avg_msg
-
-def tip(condition, list):
-    logging.info("条件:%s - 涉及:%s个词根,涉及词数:%s,平均约:%s 词数/词根" % ((condition,)+ cal(list)))
-
-def test_tip(list, ele_num):
-    start =time.time()
-    tmp = ast.literal_eval(str(random.sample(list, ele_num)))
-    end =time.time()
-    logging.info("%s个元素的字符列表转换成对象耗时%s" % (transfer_str(ele_num), end-start))
-
-def cost_statistics():
-    with open(config.KEY_REVERSE_STATISTICS_FILE, "r", encoding=config.ENCODING_CHARSET) as f:
-        count_list= []
-        total_count=0
-        for line in f:
-            first_index = line.index(",")
-            count = int(line[first_index+1:])
-            count_list.append(count)
-            total_count = total_count + count
-
-        logging.info("总祠根数:%d, 涉及的总分词查找数:%d" % (len(count_list), total_count))
-
-        tip("等于1", [val for val in count_list if val == 1])
-
-        tip("大于1小于100", [val for val in count_list if val > 1 and val < 100])
-
-        tip("大于等于100小于200", [val for val in count_list if val >= 100 and val < 200])
-
-        tip("大于等于200小于300", [val for val in count_list if val >= 200 and val < 300])
-
-        tip("大于等于300小于400", [val for val in count_list if val >= 300 and val < 400])
-
-        tip("大于等于400小于500", [val for val in count_list if val >= 400 and val < 500])
-
-        tip("大于等于500小于1000", [val for val in count_list if val >= 500 and val < 1000])
-
-        tip("大于等于1000小于5000", [val for val in count_list if val >= 1000 and val < 5000])
-
-        tip("大于等于5000小于1万", [val for val in count_list if val >= 5000 and val < 10000])
-
-        tip("大于等于1万小于5万", [val for val in count_list if val >= 10000 and val < 50000])
-
-        tip("大于等于5万小于10万", [val for val in count_list if val >= 50000 and val < 100000])
-
-        tip("大于等于10万", [val for val in count_list if val >= 100000])
-
-        sample_list = [i for i in range(14500029)]
-        test_tip(sample_list, 1)
-        test_tip(sample_list, 10)
-        test_tip(sample_list, 50)
-        test_tip(sample_list, 100)
-        test_tip(sample_list, 200)
-        test_tip(sample_list, 300)
-        test_tip(sample_list, 400)
-        test_tip(sample_list, 500)
-        test_tip(sample_list, 1000)
-        test_tip(sample_list, 5000)
-        test_tip(sample_list, 10000)
-        test_tip(sample_list, 50000)
-        test_tip(sample_list, 100000)
-        test_tip(sample_list, 595528)
-        test_tip(sample_list, 689520)
-        test_tip(sample_list, 776035)
-        test_tip(sample_list, 822266)
-        test_tip(sample_list, 951491)
-
-def memory_statistics():
-    key_reverse_index_cache = tools.load_obj(config.KEY_REVERSE_INDEX_CACHE)
-    end_pos = key_reverse_index_cache["导不出"]
-    logging.info("查找结束位置")
-    with open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as freverse, \
-        mmap.mmap(freverse.fileno(), 0, access=mmap.ACCESS_READ) as fmmap:
-
-        logging.info("开始构建缓存")
-        cache = {}
-        start = time.time()
-        while True:
-            cur_pos = fmmap.tell()
-
-            if cur_pos > end_pos:
-                break
-
-            line = fmmap.readline().decode("UTF-8")
-            first_index = line.index(",")
-            key = line[:first_index]
-
-            # 转换
-            word_root = line[first_index+1:]
-            cache[key]=ast.literal_eval(word_root)
-
-        end = time.time()
-        logging.info('构建热点缓存完成,耗时:%s,缓存数量:%d' % ((end-start), len(cache)))
-
-        logging.info('把缓存保存到本地')
-        tools.save_obj(config.KEY_REVERSE_INDEX_HOT_CACHE, cache)
-        logging.info('保存结束')
-
-        time.sleep(20)
-        logging.info('留20s进行内存观察')
-
-
-def main():
-
-    tools.init_log()
-    tools.log_start_msg(TASK_TITLE)
-
-    memory_statistics()
-
-
-    tools.log_end_msg(TASK_TITLE)
-
-
-if __name__ == "__main__":
-    main()

+ 0 - 194
src/tmp/tools.py

@@ -1,194 +0,0 @@
-# -*- coding:utf-8 -*-
-
-import math
-import logging
-import os
-import config
-import logging.config
-import pickle
-
-# Title of this module ("工具类" means "utilities").
-TITLE = "工具类"
-
-# Module-level cache of progress-tip state, used by get_tip_internal() and tip_in_size().
-tip_internal_cache = {}
-
-def init_log():
-    """
-    Configure logging from the `logging.conf` file and return the root logger.
-    """
-    # Load the logging configuration file (the path is relative to the working directory).
-    logging.config.fileConfig('logging.conf')
-
-    # getLogger() called without a name returns the root logger.
-    return logging.getLogger()
-
-def log_start_msg(msg):
-    """
-    Log a simple "start" banner for a task.
-
-    :param msg: task title placed inside the banner
-    """
-    # Pass msg as a logging argument instead of pre-formatting with `%`:
-    # a tuple title is then rendered as-is rather than being unpacked into
-    # the format string.
-    logging.info("-----------------%s 开始-----------------", msg)
-
-def log_end_msg(msg):
-    """
-    Log a simple "end" banner for a task.
-
-    :param msg: task title placed inside the banner
-    """
-    # Pass msg as a logging argument instead of pre-formatting with `%`:
-    # a tuple title is then rendered as-is rather than being unpacked into
-    # the format string.
-    logging.info("-----------------%s 结束-----------------", msg)
-
-def get_tip_internal(total_num):
-    """
-    Return the progress-tip interval for `total_num`.
-
-    The interval is ceil(total_num * config.PRECENT_TIPS) and is memoised
-    in tip_internal_cache under `total_num`.
-    """
-    cached = tip_internal_cache.get(total_num)
-    if cached:
-        return cached
-
-    # Nothing usable in the cache (missing or falsy): compute and remember it.
-    step = math.ceil(total_num * config.PRECENT_TIPS)
-    tip_internal_cache[total_num] = step
-    return step
-
-
-def tip(total_num, cur_num, is_zero_base=True):
-    """
-    Log a simple progress message.
-
-    A message is logged when the (adjusted) position equals total_num, or
-    when it is a multiple of the interval returned by get_tip_internal().
-
-    :param total_num: total number of items
-    :param cur_num: current position
-    :param is_zero_base: when True, cur_num is treated as a 0-based index
-        and shifted by one before it is reported
-    """
-
-    # TODO: report the progress as a percentage.
-
-    interval = get_tip_internal(total_num)
-
-    # Turn a 0-based index into a 1-based count.
-    if is_zero_base:
-        cur_num = cur_num + 1
-
-    if cur_num == total_num:
-        logging.info("当前进度 %d / %d" % (total_num, total_num))
-    elif interval and cur_num % interval == 0:
-        # The interval is 0 when total_num is 0; testing it first avoids a
-        # ZeroDivisionError in the modulo.
-        logging.info("当前进度 %d / %d" % (cur_num, total_num))
-
-def tip_in_size(total_size, cur_pos):
-    """
-    Log simple progress when only a total size (not a line count) is known.
-
-    The first call for a given total logs immediately; later calls log each
-    time cur_pos reaches the stored checkpoint, which then moves forward by
-    one interval (clamped to total_size).
-
-    :param total_size: total size to process
-    :param cur_pos: current position
-    """
-
-    # Use a key of its own: get_tip_internal() stores plain ints in the same
-    # cache under the bare total, and reading one of those back here would
-    # fail on the dict lookups below.
-    cache_key = ("tip_in_size", total_size)
-    tip_internal = tip_internal_cache.get(cache_key)
-    if not tip_internal:
-        # No cached state yet: build the checkpoint and the interval.
-        # The interval is at least 1 so the checkpoint loop below always
-        # advances (ceil() yields 0 for a total of 0).
-        internal = max(1, math.ceil(total_size * config.PRECENT_TIPS))
-        tip_internal = {
-            "check_point": cur_pos,
-            "internal": internal
-        }
-        tip_internal_cache[cache_key] = tip_internal
-
-    # Report once the current position reaches the checkpoint.
-    if cur_pos >= tip_internal["check_point"]:
-
-        logging.info("当前进度 %d / %d" % (cur_pos, total_size))
-
-        check_point = tip_internal["check_point"]
-        internal = tip_internal["internal"]
-
-        # Move the checkpoint past the current position.
-        while cur_pos >= check_point:
-
-            check_point = check_point + internal
-
-            # Clamp to the total. The explicit break is required: once
-            # cur_pos >= total_size the loop condition would stay true.
-            if check_point > total_size:
-                check_point = total_size
-                break
-
-        # Store the new checkpoint for the next call.
-        tip_internal["check_point"] = check_point
-
-def save_obj(path, obj):
-    """
-    Pickle `obj` into the file at `path`, creating or overwriting it.
-    """
-    with open(path, "wb") as sink:
-        sink.write(pickle.dumps(obj))
-
-def load_obj(path):
-    """
-    Read the file at `path` and return the object unpickled from it.
-
-    NOTE(review): unpickling can execute arbitrary code; this looks like it is
-    only used on files written by save_obj - confirm before using it on
-    files from elsewhere.
-    """
-    with open(path, "rb") as source:
-        raw = source.read()
-    return pickle.loads(raw)
-
-def load_stop_word():
-    """
-    Load the stop words as a dict whose keys are the words (all values are None).
-
-    If the file config.STOP_WORD_CACHE exists it is unpickled and returned
-    directly. Otherwise every file in config.STOP_WORD_DIR is read line by
-    line, and the result is pickled to config.STOP_WORD_CACHE before it is
-    returned.
-    """
-    cache_path = config.STOP_WORD_CACHE
-    if os.path.exists(cache_path):
-        logging.debug("存在停用词缓存")
-        return load_obj(cache_path)
-
-    logging.debug("正在构建停用词缓存")
-
-    # Collect the distinct lines of every stop-word file, with line breaks removed.
-    words = set()
-    for name in os.listdir(config.STOP_WORD_DIR):
-        with open(os.path.join(config.STOP_WORD_DIR, name), encoding=config.ENCODING_CHARSET) as fh:
-            words.update(row.replace("\n","").replace("\r", "") for row in fh)
-
-    # The words are returned as dict keys; the original author's note says
-    # this is meant to speed up lookups.
-    stop_word_dict = dict.fromkeys(words)
-
-    logging.debug("把停用词缓存保存到本地")
-
-    # Persist the dict so that later calls take the cached path above.
-    save_obj(cache_path, stop_word_dict)
-
-    return stop_word_dict
-
-
-def avg_split_task(total:int, split_internal:int):
-    """
-    Split `total` items into consecutive ranges of `split_internal` items.
-
-    :param total: number of items to split
-    :param split_internal: size of each range
-    :return: list of [start_pos, end_pos] pairs; an end_pos that would reach
-        or pass `total` is replaced by -1 (presumably "up to the end" -
-        confirm against the callers)
-    """
-    chunk_count = math.ceil(total / split_internal)
-
-    tasks = []
-    for idx in range(chunk_count):
-        begin = idx * split_internal
-        finish = begin + split_internal
-        # The range that reaches or passes the total gets the -1 marker.
-        tasks.append([begin, -1 if finish >= total else finish])
-
-    return tasks
-
-# Script entry: write every loaded stop word to ./data/stopword.txt, one word per line.
-if __name__ == "__main__":
-    stop_word = load_stop_word()
-    with open("./data/stopword.txt","w",encoding="UTF-8") as f:
-        for stopWord in stop_word.keys():
-            f.write("%s\n" % stopWord)

+ 1 - 34
src/utils.py

@@ -2,11 +2,9 @@
 import math
 import os
 import pickle
-import re
-import numpy as np
 
 # 停用词存放文件夹
-STOP_WORD_DIR = "./conf/stopwords"
+STOP_WORD_DIR = "./resources/stopwords"
 
 # 临时文件路径
 TEMP_PATH = "../tmp"
@@ -98,37 +96,6 @@ def avg_split_task(total: int, split_internal: int, start=0):
     return tasks
 
 
-def cal_cos_sim(a_word: str, a_stem: list, b_word: str, b_stem: list):
-    """
-    Compute the cosine similarity of two words from their stem lists.
-
-    Every distinct stem of either word is one vector dimension; its value is
-    the number of non-overlapping literal occurrences of the stem in the word.
-
-    :param a_word: word A
-    :param a_stem: stems of word A
-    :param b_word: word B
-    :param b_stem: stems of word B
-    :return: the cosine value, or 0 when either vector has zero length
-    """
-    # Merge the stems of both words (deduplicated).
-    union_stem = set(a_stem).union(b_stem)
-
-    # Build the occurrence vectors.
-    a_vec, b_vec = [], []
-    for word in union_stem:
-        # re.escape makes every stem match literally, whatever regex
-        # metacharacters it contains ("c++", "g++", ".net", ...).
-        pattern = re.escape(word)
-        a_vec.append(len(re.findall(pattern, a_word)))
-        b_vec.append(len(re.findall(pattern, b_word)))
-
-    # Cosine = dot product divided by the product of the norms.
-    vec1 = np.array(a_vec)
-    vec2 = np.array(b_vec)
-    val = (np.linalg.norm(vec1) * np.linalg.norm(vec2))
-    if val == 0:
-        return 0
-    return vec1.dot(vec2) / val
-
-
 def remove_line_break(line: str):
     """
     移除换行符