@@ -1,24 +1,55 @@
# -*- coding:utf-8 -*-
+import datetime
import os
import math
import pickle
import jieba
from multiprocessing import Process, Manager

+# TODO
+# 1. Investigate multiprocess segmentation with jieba (its built-in parallel mode cannot be used on Windows; see the note below)
+# 2. Further reduce the number of checkpoint saves (i.e. tune the save interval, or keep everything in memory)
+# 3. Save the results once segmentation finishes, to guard against data loss
+# 4. Save the segmentation results per process, then merge them at the end
+
+# Path of the data file to process
+DATA_FILE = './data/合并结果.txt'
+
+# Output file for the segmentation results
+CUT_OUTPUT_FILE = './data/分词结果.txt'
+
+# Whether to save the results once segmentation finishes
+IS_ASSUME_TOTAL = True
+
+# Whether to resume from checkpoints
+IS_ASSUME = False
+
+# Whether test mode is enabled
+IS_TEST_MODE = False
+
+# Number of lines used in test mode (100 * 10000 = 1,000,000)
+TEST_DATA_NUM = 100 * 10000
+
+# Checkpoint save interval in test mode
+TEST_SAVE_INTERNAL = 200
+
+# Encoding
+ENCODING_CHARSET = "UTF-8"
+
+# Checkpoint (config) file path
+CONFIG_PATH = "./data/pkl/cut_config_%d.pkl"
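+# (The %d is filled with each worker's index; note the ./data/pkl directory must
+# already exist, or writing the first checkpoint will fail with FileNotFoundError.)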
+
# Number of worker processes
-PROCESS_NUM = 5
+PROCESS_NUM = os.cpu_count() or 1  # cpu_count() can return None; fall back to 1
+
# Save interval (how often to checkpoint)
-SAVE_INTERNAL = 100000
-# Config file path
-CONFIG_PATH = "./cut_config_%d.pkl"
-# Path of the data file to process
-DATA_FILE = './merge.txt'
+SAVE_INTERNAL = TEST_SAVE_INTERNAL if IS_TEST_MODE else 1000000

-# Container for worker processes
-process_list = []
-# Container for configs
-config_list = []
+# Progress report interval
+PROCESS_TIPS_INTERNAL = 10 * 10000

def save_config(config_path, config_obj):
@@ -43,7 +74,7 @@ def cut_word(word):
    word_root = jieba.cut_for_search(word)
    return list(word_root)
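+# For reference, cut_for_search() emits overlapping sub-words to aid recall, e.g.
+# cut_word("中华人民共和国") returns something like
+# ['中华', '华人', '人民', '共和', '共和国', '中华人民共和国'] (exact tokens depend on the dictionary version).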

-def multiprocess_cut_word(process_name, config_path, cut_config):
+def multiprocess_cut_word(process_name, data_list, result_dict, config_path, cut_config):

    """
    Perform word segmentation in a worker process
@@ -51,61 +82,127 @@ def multiprocess_cut_word(process_name, config_path, cut_config):

    print('Process %s -> segmentation started' % process_name)

-    if os.path.exists(config_path) :
+    if (IS_ASSUME_TOTAL or IS_ASSUME) and os.path.exists(config_path):
        cut_config = load_config(config_path)
        print("Process %s -> resumed from checkpoint, state: %s, start position: %d" % (process_name, cut_config["state"], cut_config["current_pos"]))

    if cut_config['state'] == 'run':
|
|
|
- with open(DATA_FILE, "r", encoding="UTF-8") as f:
|
|
|
- lines = f.readlines()
|
|
|
- lines = lines[cut_config['current_pos']:cut_config['end_pos']]
|
|
|
- print("进程:%s ->剩余待处理数量:%d" % (process_name, len(lines)))
|
|
|
- for i, line in enumerate(lines):
|
|
|
- line = line.replace("\n", "")
|
|
|
- word_root = cut_word(line)
|
|
|
- cut_config["word_dict"][line]=word_root
|
|
|
-
|
|
|
- if i > 0 and i % SAVE_INTERNAL == 0:
|
|
|
- cut_config["current_pos"] = cut_config["current_pos"] + SAVE_INTERNAL
|
|
|
- print("进程:%s -> 保存位置 当前状态:%s,开始处理位置:%d" % (process_name, cut_config["state"], cut_config["current_pos"]))
|
|
|
- save_config(config_path, cut_config)
+
+        # Fetch the lines still waiting to be segmented
+        lines = data_list[cut_config['current_pos']:cut_config['end_pos']]
+
+        # Count how much data needs processing
+        total_num = len(lines)
+        print("Process %s -> remaining lines to process: %d" % (process_name, total_num))
+
+        for i, line in enumerate(lines):
+            # Remove the newline
+            line = line.replace("\n", "")
+            # Segment
+            cut_config["word_dict"][line] = cut_word(line)
+
+            # Checkpoint save
+            if IS_ASSUME and i > 0 and i % SAVE_INTERNAL == 0:
+                cut_config["current_pos"] = cut_config["current_pos"] + SAVE_INTERNAL
+                print("Process %s -> checkpoint saved, state: %s, current position: %d" % (process_name, cut_config["state"], cut_config["current_pos"]))
+                save_config(config_path, cut_config)
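+                # A crash between checkpoints re-segments at most SAVE_INTERNAL lines on the next run.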

+            # Progress report
+            if i > 0 and i % PROCESS_TIPS_INTERNAL == 0:
+                print("Process %s -> progress: %d / %d" % (process_name, i, total_num))
+
+        # Save the final result
+        if IS_ASSUME_TOTAL or IS_ASSUME:
+            print("Process %s -> saving the final segmentation result" % process_name)
            cut_config["state"] = "end"
+            cut_config["current_pos"] = cut_config['end_pos']
            save_config(config_path, cut_config)
+
+        result_dict[process_name] = cut_config["word_dict"]

-    print('Process %s -> segmentation finished' % process_name)
+        print('Process %s -> segmentation finished' % process_name)
    else:
-        print('Process %s -> resumed from checkpoint, segmentation already finished' % process_name)
+        result_dict[process_name] = cut_config["word_dict"]
+        print('Process %s -> resumed from checkpoint, segmentation already finished' % process_name)

def main():
+
+    print("Start time:", datetime.datetime.now())
+
+    print(("Config: checkpoint resume enabled, save interval: %d" % SAVE_INTERNAL) if IS_ASSUME else "Config: checkpoint resume disabled")
+    print("Config: the final segmentation result will be saved" if IS_ASSUME_TOTAL else "Config: the final segmentation result will not be saved")
+
+    # Container for worker processes
+    process_list = []
+    # Container for per-process configs
+    config_list = []
+
+    # Shared state for multiprocessing
+    manager = Manager()
+    # Data source shared across all processes
+    global_list = manager.list()
+    # Results returned by the workers
+    result_dict = manager.dict()
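+    # NOTE: manager.list()/manager.dict() are proxies served by a separate process;
+    # mutations to a dict nested inside result_dict are not propagated back, which
+    # is why each worker assigns its whole word_dict to one key in a single step.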
+
+    print("Loading data")
    with open(DATA_FILE, "r", encoding="UTF-8") as f:
-        lines = f.readlines()
-        total_len = len(lines)
-        count = math.ceil(total_len / PROCESS_NUM)
-        print("Total lines: %d, chunk size: %d" % (total_len, count))
-        for i in range(PROCESS_NUM):
-            start_pos = i * count
-            end_pos = i * count + count
-            if end_pos >= total_len:
-                end_pos = -1
-            cut_config = {
-                "state": "run",
-                "start_pos": start_pos,
-                "current_pos": start_pos,
-                "end_pos": end_pos,
-                "word_dict": {}
-            }
-            config_list.append(cut_config)
+        if IS_TEST_MODE:
+            print("Running in test mode, test data size: %d" % TEST_DATA_NUM)
+            global_list.extend(f.readlines()[:TEST_DATA_NUM])
+        else:
+            global_list.extend(f.readlines())
+
+    total_len = len(global_list)
+    count = math.ceil(total_len / PROCESS_NUM)
+    print("Total lines to process: %d, chunk size: %d" % (total_len, count))
+
+    # Build one config per process
+    for i in range(PROCESS_NUM):
+        start_pos = i * count
+        end_pos = i * count + count
+        if end_pos >= total_len:
+            # Clamp to total_len; slicing with -1 here would silently drop the last line
+            end_pos = total_len
+        cut_config = {
+            "state": "run",
+            "start_pos": start_pos,
+            "current_pos": start_pos,
+            "end_pos": end_pos,
+            "word_dict": {}
+        }
+        config_list.append(cut_config)
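+    # e.g. 10 lines across 4 processes: count = ceil(10/4) = 3, giving chunks
+    # [0:3], [3:6], [6:9], [9:10] (the last chunk is clamped to total_len).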

    print("Configs", config_list)

    for i, config in enumerate(config_list):
-        p = Process(target=multiprocess_cut_word, args=("Process-%d" % i, CONFIG_PATH % i, config))
+        p = Process(target=multiprocess_cut_word, args=("Process-%d" % i, global_list, result_dict, CONFIG_PATH % i, config))
        p.start()
        process_list.append(p)

    for p in process_list:
        p.join()

+    print("Merging the final segmentation results: start")
+
+    result = []
+    print("Flattening into a list for writing")
+    for (process_name, word_dict) in result_dict.items():
+        result.extend("%s,%s\n" % (key, value) for (key, value) in word_dict.items())
+    print("Writing to file")
+    with open(CUT_OUTPUT_FILE, "w", encoding=ENCODING_CHARSET) as f:
+        f.writelines(result)
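+    # Each output line is "<original text>,<token list rendered via %s>", e.g. for
+    # an input line "今天天气不错" something like: 今天天气不错,['今天', '天气', '不错']
+    # (the exact tokens depend on the jieba dictionary).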
+    print("Merging the final segmentation results: done")
+
+    print("End time:", datetime.datetime.now())
+
if __name__ == '__main__':
    main()