2 年之前 · 68de1b2aa1
--- a/.gitignore
+++ b/.gitignore
@@ -165,3 +165,4 @@ cython_debug/
 
				 tmp/
			
 
				 !src/tmp
			
 
				 data/
			
 
				+*.jar
			
--- a/src/agg.py
+++ b/src/agg.py
@@ -1,6 +1,7 @@
 
				 # -*- coding:utf-8 -*-
			
 
				 import math
			
 
				 import os
			
 
				+import subprocess
			
 
				 import time
			
 
				 import zipfile
			
 
				 from concurrent.futures import ProcessPoolExecutor, as_completed
			
@@ -31,7 +32,9 @@ WORD_AGG_RESULT_TEMP_FILE = "长尾词_聚合结果_临时.txt"
 
				 WORD_AGG_RESULT_FILE = "长尾词_聚合结果.txt"
			
 
				 
			
 
				 # 文件夹：历史聚合数据归档文件夹
			
 
				-AGG_ARCHIVE_DIR = "长尾词聚合分析_%s"
			
 
				+WORD_AGG_DIR = "长尾词聚合分析_%s"
			
 
				+
			
 
				+jieba.setLogLevel(jieba.logging.INFO)
			
 
				 
			
 
				 
			
 
				 def agg_word(path: str):
			
@@ -50,10 +53,11 @@ def agg_word(path: str):
 
				     if os.path.isdir(path):
			
 
				         files = os.listdir(path)
			
 
				         for file in files:
			
 
				+            file_path = os.path.join(path, file)
			
 
				             if file.endswith(COMPRESS_FILE_SUFFIX):
			
 
				-                zip_files.append(file)
			
 
				+                zip_files.append(file_path)
			
 
				             elif file.endswith(WORD_FILE_SUFFIX):
			
 
				-                txt_files.append(file)
			
 
				+                txt_files.append(file_path)
			
 
				     elif path.endswith(COMPRESS_FILE_SUFFIX):
			
 
				         zip_files.append(path)
			
 
				     elif path.endswith(WORD_FILE_SUFFIX):
			
@@ -64,10 +68,8 @@ def agg_word(path: str):
 
				         return
			
 
				 
			
 
				     # 创建分析结果文件夹路径
			
 
				-    data_path = path
			
 
				-    if os.path.isfile(data_path):
			
 
				-        data_path = os.path.join(data_path[0:data_path.rindex("\\") + 1],
			
 
				-                                 AGG_ARCHIVE_DIR % time.strftime('%Y%m%d%H%M%S'))
			
 
				+    data_path = os.path.join(path[0:path.rindex("\\") + 1] if os.path.isfile(path) else path,
			
 
				+                             WORD_AGG_DIR % time.strftime('%Y%m%d%H%M%S'))
			
 
				     os.makedirs(data_path)
			
 
				     print("创建聚合分析结果文件夹，路径：" + data_path)
			
 
				 
			
@@ -77,36 +79,32 @@ def agg_word(path: str):
 
				         start_pos = 1
			
 
				 
			
 
				     chains = [
			
 
				-        ("5118关键词压缩文件提取数据", extract_word_from_5118),
			
 
				-        ("合并长尾词", merge_word),
			
 
				-        ("长尾词分词和建立倒排索引", word_split_and_reverse_index),
			
 
				-        ("调用java聚合处理程序", agg_process),
			
 
				-        ("对聚合后的文件内容进行排序重写", sort_file_content)
			
 
				+        ("5118关键词压缩文件提取数据", extract_word_from_5118, True, zip_files),
			
 
				+        ("合并长尾词", merge_word, True, txt_files),
			
 
				+        ("长尾词分词和建立倒排索引", word_split_and_reverse_index, False),
			
 
				+        ("调用java聚合处理程序", agg_process, False),
			
 
				+        ("对聚合后的文件内容进行排序重写", sort_file_content, False)
			
 
				     ]
			
 
				 
			
 
				-    chains = chains[start_pos:-1]
			
 
				+    chains = chains[start_pos:]
			
 
				     chains_len = len(chains)
			
 
				     for i, chain in enumerate(chains, start=1):
			
 
				         print("步骤(%s/%s)：%s 开始..." % (i, chains_len, chain[0]))
			
 
				-        is_success = chain[1](data_path)
			
 
				+        is_success = chain[1](data_path, chain[3]) if chain[2] else chain[1](data_path)
			
 
				         if not is_success:
			
 
				             print("执行异常结束执行!")
			
 
				             return
			
 
				 
			
 
				+    print("长尾词聚合程序执行完成!")
			
 
				+
			
 
				 
			
 
				-def extract_word_from_5118(data_path: str, **params):
			
 
				+def extract_word_from_5118(data_path: str, zip_files: list):
			
 
				     """
			
 
				     从5118关键词压缩文件中提取数据
			
 
				     :param data_path: 分析结果文件夹路径
			
 
				+    :param zip_files: 待解压缩列表
			
 
				     :return: None
			
 
				     """
			
 
				-
			
 
				-    # 获取压缩文件列表
			
 
				-    zip_files = []
			
 
				-    for file in os.listdir(data_path):
			
 
				-        if file.endswith(COMPRESS_FILE_SUFFIX):
			
 
				-            zip_files.append(os.path.join(data_path, file))
			
 
				-
			
 
				     for i, file in enumerate(zip_files):
			
 
				         z_file = zipfile.ZipFile(file)
			
 
				         filenames = z_file.namelist()
			
@@ -140,26 +138,21 @@ def extract_word_from_5118(data_path: str, **params):
 
				     return True
			
 
				 
			
 
				 
			
 
				-def merge_word(data_path: str, **params):
			
 
				+def merge_word(data_path: str, txt_files: list):
			
 
				     """
			
 
				     合并长尾词（带去重）
			
 
				     :param data_path: 分析结果文件夹路径
			
 
				+    :param txt_files: 待合并文件列表
			
 
				     :return:
			
 
				     """
			
 
				-    # 获取文件列表
			
 
				-    file_list = []
			
 
				-    for file in os.listdir(data_path):
			
 
				-        if file.endswith(WORD_FILE_SUFFIX):
			
 
				-            file_list.append(os.path.join(data_path, file))
			
 
				-
			
 
				     # 长尾词集合容器
			
 
				     word_set = set()
			
 
				 
			
 
				     # 读取数据并排重
			
 
				-    for i, file in enumerate(file_list):
			
 
				+    for i, file in enumerate(txt_files):
			
 
				         with open(file, "r", encoding="utf-8") as f:
			
 
				-            for word in f:
			
 
				-                word_set.add(word.replace("\n", ""))
			
 
				+            for line in f:
			
 
				+                word_set.add(utils.remove_line_break(line))
			
 
				 
			
 
				     # 保存合并结果
			
 
				     with open(os.path.join(data_path, WORD_FILE), "w", encoding="utf-8") as f:
			
@@ -176,7 +169,6 @@ def word_split_and_reverse_index(data_path: str):
 
				     :param data_path: 数据存放路径
			
 
				     :return:
			
 
				     """
			
 
				-
			
 
				     # 判断文件是否存在
			
 
				     file = os.path.join(data_path, WORD_FILE)
			
 
				     if os.path.exists(file) and not os.path.isfile(file):
			
@@ -304,9 +296,11 @@ def agg_process(data_path: str):
 
				     """
			
 
				     调用java聚合处理程序
			
 
				     :param data_path: 分析结果文件夹路径
			
 
				-    :return:
			
 
				+    :return: True-运行正常 False-运行失败
			
 
				     """
			
 
				-    return True
			
 
				+    cmds = ["java", "-jar", "./resources/money-mining-1.0-jar-with-dependencies.jar", "agg", data_path]
			
 
				+    return_code = subprocess.run(cmds).returncode
			
 
				+    return 0 == return_code
			
 
				 
			
 
				 
			
 
				 def sort_file_content(data_path: str):
			
@@ -315,10 +309,9 @@ def sort_file_content(data_path: str):
 
				     :param data_path: 分析结果文件夹路径
			
 
				     :return:
			
 
				     """
			
 
				-
			
 
				     # 构造源文件路径
			
 
				     src_path = os.path.join(data_path, WORD_AGG_RESULT_TEMP_FILE)
			
 
				-    if os.path.exists(src_path) and not os.path.isfile(src_path):
			
 
				+    if not os.path.exists(src_path) or not os.path.isfile(src_path):
			
 
				         print("文件不存在！ " + src_path)
			
 
				         return False
			
 
				 
			
--- a/src/mining.py
+++ b/src/mining.py
@@ -1,7 +1,7 @@
 
				 # -*- coding:utf-8 -*-
			
 
				 import sys
			
 
				 
			
 
				-from agg import agg_word
			
 
				+from agg import agg_word, agg_process
			
 
				 
			
 
				 
			
 
				 def main(args: list):
			
--- a/src/resources/stopwords/baidu_stopwords.txt
+++ b/src/resources/stopwords/baidu_stopwords.txt
--- a/src/resources/stopwords/cn_stopwords.txt
+++ b/src/resources/stopwords/cn_stopwords.txt
--- a/src/resources/stopwords/hit_stopwords.txt
+++ b/src/resources/stopwords/hit_stopwords.txt
--- a/src/resources/stopwords/scu_stopwords.txt
+++ b/src/resources/stopwords/scu_stopwords.txt
--- a/src/resources/stopwords/停用词.txt
+++ b/src/resources/stopwords/停用词.txt
--- a/src/utils.py
+++ b/src/utils.py
@@ -4,7 +4,7 @@ import os
 
				 import pickle
			
 
				 
			
 
				 # 停用词存放文件夹
			
 
				-STOP_WORD_DIR = "./conf/stopwords"
			
 
				+STOP_WORD_DIR = "./resources/stopwords"
			
 
				 
			
 
				 # 临时文件路径
			
 
				 TEMP_PATH = "../tmp"