ChenGanBin committed 3 years ago
parent
fad2665d92
commit
5 files changed, 314 additions and 150 deletions
  1. + 5 - 1      .gitignore
  2. + 0 - 95     cut.py
  3. + 142 - 45   cut_multiprocess.py
  4. + 147 - 0    cut_statistics.py
  5. + 20 - 9     merge_cut_word.py

+ 5 - 1
.gitignore

@@ -1,4 +1,8 @@
 __pycache__/
 data/tmp/*.pkl
 data/tmp/*.txt
-data/tmp/*.csv
+data/tmp/*.csv
+data/pkl/*.pkl
+data/*.txt
+data/*.csv
+临时/

+ 0 - 95
cut.py

@@ -1,95 +0,0 @@
-# -*- coding:utf-8 -*-
-
-import os
-import csv
-import jieba
-import pandas as pd
-
-ENCODING_CHARSET = "UTF-8"
-
-DATA_KEYWORD_FILE = "E:\Download\长尾关键词\什么长尾词\什么长尾词_1655457938_utf8.csv"
-CUT_OUTPUT_FILE = "./cut_out.csv"
-
-def cutWord(origFile, destFile):
-
-    print("开始处理")
-
-    if not os.path.exists(origFile):
-        raise Exception("源文件不存在")
-
-    key_dict = {}
-
-    print("执行分词操作并进行词频统计")
-
-    # 分词并统计词频
-    with open(DATA_KEYWORD_FILE, "r", encoding=ENCODING_CHARSET) as f:
-        lines = f.readlines()
-        for item in lines[2:]:
-        # for item in f:
-            longTailKey = item.split(",")[0]
-            cutWord = jieba.cut_for_search(longTailKey)
-            for word in cutWord:
-                if word in key_dict:
-                    key_dict[word] = key_dict[word] + 1
-                else:
-                    key_dict[word] = 1
-
-    print("根据词频进行倒序排列")
-
-    # 根据词频倒序排列
-    sorted_key_dict = sorted(key_dict.items(), key=lambda x: x[1], reverse=True)
-
-    print("把结果写入到 %s 文件中" % CUT_OUTPUT_FILE)
-
-    # # 写入到csv文件
-    with open(CUT_OUTPUT_FILE, "w", newline='', encoding=ENCODING_CHARSET) as csvFile:
-        writer = csv.writer(csvFile)
-        for rowItem in sorted_key_dict:
-            writer.writerow(rowItem)
-    
-    print("处理结束")
-
-ORIG_FILE = "./cut_out.csv"
-DEST_FILE = "./cut_out_filter.csv"
-STOP_WORD_DIR = "./data/stopwords"
-
-def filterStopWord() :
-    print("导入分词结果")
-    df = pd.read_csv(ORIG_FILE, names=['key','count'])
-
-    # Load stop words
-    print("Loading stop words")
-
-    stop_word = [];
-
-    stop_word_files = os.listdir(STOP_WORD_DIR)
-    for file in stop_word_files:
-        stop_word_file = os.path.join(STOP_WORD_DIR, file)
-        with open(stop_word_file, encoding=ENCODING_CHARSET) as f:
-            for item in f:
-                stop_word.append(item.replace("\n",""))
-    print("去重前,停用词数量:", len(stop_word))
-    stop_word = list(set(stop_word))
-    print("去重后,停用词数量:", len(stop_word))
-
-    # 过滤停用词
-    print("过滤停用词")
-    print("过滤前,总分词数量:%d" % len(df))
-    df = df[df.apply(lambda row : row['key'] not in stop_word, axis=1)]
-    print("过滤后,总分词数量:%d" % len(df))
-
-    print("大于1000的数量:", df[df['count'] > 1000].count().key)
-    print("大于500的数量:", df[df['count'] > 500].count().key)
-    print("大于400的数量:", df[df['count'] > 400].count().key)
-    print("大于300的数量:", df[df['count'] > 300].count().key)
-    print("大于250的数量:", df[df['count'] > 250].count().key)
-    print("大于100的数量:", df[df['count'] > 100].count().key)
-
-    # 导出过滤后的数据,不要表头和行号
-    print("导出过滤后的结果")
-    df.to_csv(DEST_FILE, header=False, index=False)
-
-
-cutWord(DATA_KEYWORD_FILE, CUT_OUTPUT_FILE)
-filterStopWord()
-

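Review note on the removed cut.py: its stop-word filtering used a row-wise df.apply with a Python lambda, which is slow on large frames. Below is a minimal vectorized sketch using pandas' isin, assuming the same 'key'/'count' columns; the helper name filter_stop_words is illustrative only and not part of this commit.

# Sketch only: vectorized stop-word filtering, equivalent to the row-wise apply in the deleted cut.py.
import pandas as pd

def filter_stop_words(df: pd.DataFrame, stop_words) -> pd.DataFrame:
    # Keep rows whose 'key' is not a stop word; the boolean mask avoids
    # calling a Python function once per row.
    return df[~df['key'].isin(set(stop_words))]

if __name__ == '__main__':
    demo = pd.DataFrame({'key': ['什么', '长尾词', '工具'], 'count': [120, 80, 30]})
    print(filter_stop_words(demo, ['什么']))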
+ 142 - 45
cut_multiprocess.py

@@ -1,24 +1,55 @@
 # -*- coding:utf-8 -*-
 
+import datetime
 import os
 import math
 import pickle
 import jieba
 from multiprocessing import Process, Manager
 
+# TODO
+# 1. Investigate multiprocess segmentation with jieba (its built-in parallel mode does not work on Windows)
+# 2. Further reduce how often checkpoints are saved (tune the save interval, or keep everything in memory)
+# 3. Save the result once segmentation finishes, so it is not lost
+# 4. Save the segmentation result in each process separately, then merge them at the end
+
+
+
+# Path of the data file to process
+DATA_FILE = './data/合并结果.txt'
+
+# Output file for the segmentation result
+CUT_OUTPUT_FILE = './data/分词结果.txt'
+
+# Whether to save the result after segmentation finishes
+IS_ASSUME_TOTAL = True
+
+# Whether to enable checkpoint/resume
+IS_ASSUME = False
+
+# Whether to run in test mode
+IS_TEST_MODE = False
+
+# Amount of data used in test mode
+TEST_DATA_NUM = 100 * 10000
+
+# Checkpoint save interval in test mode
+TEST_SAVE_INTERNAL = 200
+
+# File encoding
+ENCODING_CHARSET = "UTF-8"
+
+# Path template for the checkpoint config files
+CONFIG_PATH = "./data/pkl/cut_config_%d.pkl"
+
 # Number of worker processes
-PROCESS_NUM = 5
+PROCESS_NUM = os.cpu_count()
+
 # Save interval (how often to checkpoint)
-SAVE_INTERNAL = 100000
-# Config file path
-CONFIG_PATH = "./cut_config_%d.pkl"
-# Path of the data file to process
-DATA_FILE = './merge.txt'
+SAVE_INTERNAL = TEST_SAVE_INTERNAL if IS_TEST_MODE else 1000000
 
-# Worker process container
-process_list = []
-# Config container
-config_list = []
+# Progress reporting interval
+PROCESS_TIPS_INTERNAL = 10 * 10000
 
 
 def save_config(config_path, config_obj):
@@ -43,7 +74,7 @@ def cut_word(word):
     word_root = jieba.cut_for_search(word)
     return list(word_root)
 
-def multiprocess_cut_word(process_name, config_path, cut_config):
+def multiprocess_cut_word(process_name, data_list, result_dict, config_path, cut_config):
 
     """
     Run word segmentation in a worker process
@@ -51,61 +82,127 @@ def multiprocess_cut_word(process_name, config_path, cut_config):
 
     print('Process: %s -> segmentation started' % process_name)
 
-    if os.path.exists(config_path) :
+    if (IS_ASSUME_TOTAL or IS_ASSUME) and os.path.exists(config_path) :
         cut_config = load_config(config_path)
         print("进程:%s -> 进断点恢复 当前状态:%s,开始处理位置:%d" % (process_name, cut_config["state"], cut_config["current_pos"]))
 
     if cut_config['state'] == 'run':
-        with open(DATA_FILE, "r", encoding="UTF-8") as f:
-            lines = f.readlines()
-            lines = lines[cut_config['current_pos']:cut_config['end_pos']]
-            print("进程:%s ->剩余待处理数量:%d" % (process_name, len(lines)))
-            for i, line in enumerate(lines):
-                line = line.replace("\n", "")
-                word_root = cut_word(line)
-                cut_config["word_dict"][line]=word_root
-
-                if i > 0 and i % SAVE_INTERNAL == 0:
-                    cut_config["current_pos"] = cut_config["current_pos"] + SAVE_INTERNAL
-                    print("进程:%s -> 保存位置 当前状态:%s,开始处理位置:%d" % (process_name, cut_config["state"], cut_config["current_pos"]))
-                    save_config(config_path, cut_config)
+
+        # Get the data to be segmented
+        lines = data_list[cut_config['current_pos']:cut_config['end_pos']]
+
+        # Count how many items need to be processed
+        total_num = len(lines)
+        print("Process: %s -> remaining items to process: %d" % (process_name, total_num))
+
+        for i, line in enumerate(lines):
+            # Clean the line
+            line = line.replace("\n", "")
+            # Segment the keyword
+            cut_config["word_dict"][line]=cut_word(line)
+
+            # Checkpoint save
+            if IS_ASSUME and i > 0 and i % SAVE_INTERNAL == 0:
+                cut_config["current_pos"] = cut_config["current_pos"] + SAVE_INTERNAL
+                print("Process: %s -> checkpoint saved, state: %s, current position: %d" % (process_name, cut_config["state"], cut_config["current_pos"]))
+                save_config(config_path, cut_config)
 
+            # Progress report
+            if i > 0 and i % PROCESS_TIPS_INTERNAL == 0:
+                print("Process: %s -> progress: %d / %d" % (process_name, i, total_num))
+
+        # Save the final result
+        if IS_ASSUME_TOTAL or IS_ASSUME:
+            print("Process: %s -> saving the final segmentation result" % process_name)
             cut_config["state"] = "end"
+            cut_config["current_pos"] = cut_config['end_pos']
             save_config(config_path, cut_config)
+        
+        # result_dict.update(cut_config["word_dict"])
+        result_dict[process_name]=cut_config["word_dict"]
 
-            print('Process: %s -> segmentation finished' % process_name)
+        print('Process: %s -> segmentation finished' % process_name)
     else :
-        print('Process: %s -> resumed from checkpoint, segmentation finished' % process_name)
+        # result_dict.update(cut_config['word_dict'])
+        result_dict[process_name]=cut_config["word_dict"]
+        print('Process: %s -> resumed from checkpoint, segmentation finished' % process_name)
 
 def main():
+
+    print("开始时间:", datetime.datetime.now())
+
+    print("配置:启动用断点续存,保存间隔:%d" % SAVE_INTERNAL if IS_ASSUME else "配置:不启用断点续存")
+    print("配置:保存最终的分词结果" if IS_ASSUME_TOTAL else "配置:不保存最终的分词结果")
+
+    # 处理进程容器
+    process_list = []
+    # 配置文件容器
+    config_list = []
+
+    # 设置多进程共享变量
+    manager = Manager()
+    # 多进程共享的数据源
+    global_list = manager.list()
+    # 多进程返回的结果
+    result_dict = manager.dict()
+
+    print("加载数据")
     with open(DATA_FILE, "r", encoding="UTF-8") as f:
-        lines = f.readlines()
-        total_len = len(lines)
-        count = math.ceil(total_len / PROCESS_NUM)
-        print("总数量:%d, 数量区间:%d" % (total_len, count))
-        for i in range(PROCESS_NUM):
-            start_pos = i * count
-            end_pos = i * count + count
-            if end_pos >= total_len :
-                end_pos = -1
-            cut_config = {
-                "state": "run",
-                "start_pos": start_pos,
-                "current_pos": start_pos,
-                "end_pos": end_pos,
-                "word_dict": {}
-            }
-            config_list.append(cut_config)
+        if IS_TEST_MODE:
+            print("当前处于测试模式,测试数据量:%d" % TEST_DATA_NUM)
+            global_list.extend(f.readlines()[:TEST_DATA_NUM])
+        else:
+            global_list.extend(f.readlines())
+    
+    total_len = len(global_list)
+    count = math.ceil(total_len / PROCESS_NUM)
+    print("待处理总数量:%d, 数量区间:%d" % (total_len, count))
+
+    # 构造配置
+    for i in range(PROCESS_NUM):
+        start_pos = i * count
+        end_pos = i * count + count
+        if end_pos >= total_len :
+            end_pos = total_len  # use the list length (not -1) so the slice keeps the last line
+        cut_config = {
+            "state": "run",
+            "start_pos": start_pos,
+            "current_pos": start_pos,
+            "end_pos": end_pos,
+            "word_dict": {}
+        }
+        config_list.append(cut_config)
 
     print("配置", config_list)
 
     for i, config in enumerate(config_list):
-        p = Process(target=multiprocess_cut_word, args=("Process-%d" % i, CONFIG_PATH % i, config))
+        p = Process(target=multiprocess_cut_word, args=("Process-%d" % i, global_list, result_dict, CONFIG_PATH % i, config))
         p.start()
         process_list.append(p)
 
     for p in process_list:
         p.join()
 
+    print("合并最终的分词结果:开始")
+
+    result = []
+    print("处理成list便于写入文件")
+    for (process_name, word_dict) in result_dict.items():
+        tmp = None
+        for (key, value) in word_dict.items():
+            tmp = ["%s,%s\n" % (key, value) for (key, value) in word_dict.items() ]
+        result.extend(tmp)
+    print("写入文件")
+    with open(CUT_OUTPUT_FILE, "w", encoding=ENCODING_CHARSET) as f:
+        f.writelines(result)
+    # with open(CUT_OUTPUT_FILE, "w", encoding=ENCODING_CHARSET) as f:
+    #     for (process_name, word_dict) in result_dict.items():
+    #         for (key, value) in word_dict.items():
+    #             f.write("%s,%s\n" % (key, value))
+    #             # f.write("\n")
+    print("合并最终的分词结果:结束")
+
+    print("结束时间:", datetime.datetime.now())
+
 if __name__ == '__main__':
     main()

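Review note on cut_multiprocess.py: TODO item 1 mentions jieba's built-in parallel mode (jieba.enable_parallel), which is not supported on Windows. Below is a minimal sketch of a portable alternative built on multiprocessing.Pool, under the assumption that chunking the keyword list by line is acceptable; the names cut_chunk and parallel_cut are illustrative and not part of this commit.

# Sketch only: a multiprocessing.Pool alternative to the Process/Manager setup above.
import math
import jieba
from multiprocessing import Pool

def cut_chunk(chunk):
    # Each worker loads its own jieba dictionary and segments its chunk.
    return {line.strip(): list(jieba.cut_for_search(line.strip())) for line in chunk}

def parallel_cut(lines, workers=4):
    # Split the input into roughly equal chunks, one per worker.
    size = math.ceil(len(lines) / workers)
    chunks = [lines[i:i + size] for i in range(0, len(lines), size)]
    merged = {}
    with Pool(processes=workers) as pool:
        # map() returns one dict per chunk; merge them in the parent process.
        for part in pool.map(cut_chunk, chunks):
            merged.update(part)
    return merged

if __name__ == '__main__':
    sample = ["什么长尾词怎么挖掘", "长尾关键词工具"]
    print(parallel_cut(sample, workers=2))

Because Pool.map already hands the per-chunk dicts back to the parent, this shape would also cover TODO item 4 (collect per-process results and merge at the end) without a shared Manager dict.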
+ 147 - 0
cut_statistics.py

@@ -0,0 +1,147 @@
+# -*- coding:utf-8 -*-
+
+import os
+import jieba
+
+# Data file to process
+DATA_KEYWORD_FILE = "E:\Download\怎么长尾词_1655561719.csv"
+
+# Output result file
+CUT_OUTPUT_FILE = "./data/分词与词频统计结果.csv"
+
+# File encoding
+ENCODING_CHARSET = "UTF-8"
+
+# Stop word directory
+STOP_WORD_DIR = "./data/stopwords"
+
+# Progress report interval
+INTERNAL_NUM = 50000
+
+def cut_word_and_statistics(data):
+
+    """
+    分词并统计词频
+    """
+
+    print("开始执行分词操作并进行词频统计")
+
+    total_num = len(data)
+    print("共需处理数据:%d" % total_num)
+
+    # 分词结果容器
+    key_dict = {}
+
+    # 跳过开头两行
+    for i, item in enumerate(data):
+        # 只需要第一列的数据
+        longTailKey = item.split(",")[0]
+
+        longTailKey = longTailKey.replace("\n", "")
+        
+        # Segment the keyword
+        cutWord = jieba.cut_for_search(longTailKey)
+
+        # Count word frequency
+        for word in cutWord:
+            if word in key_dict:
+                key_dict[word] = key_dict[word] + 1
+            else:
+                key_dict[word] = 1
+        
+        # Progress report
+        if i % INTERNAL_NUM == 0:
+            print("Segmentation progress %d / %d" % (i, total_num))
+
+    print("根据词频进行倒序排列")
+
+    # 根据词频倒序排列
+    sorted_key_dict = sorted(key_dict.items(), key=lambda x: x[1], reverse=True)
+
+    print("分词结束")
+
+    return sorted_key_dict
+
+
+def load_stop_words():
+    """
+    加载停用词列表
+    """
+    
+    print("加载停用词 - 开始")
+
+    # 停用词容器
+    stop_word = []
+
+    stop_word_files = os.listdir(STOP_WORD_DIR)
+
+    for file in stop_word_files:
+        stop_word_file = os.path.join(STOP_WORD_DIR, file)
+        with open(stop_word_file, encoding=ENCODING_CHARSET) as f:
+            for item in f:
+                stop_word.append(item.replace("\n",""))
+
+    print("去重前,停用词数量:", len(stop_word))
+
+    stop_word = list(set(stop_word))
+
+    print("去重后,停用词数量:", len(stop_word))
+
+    print("加载停用词 - 结束")
+
+    return stop_word
+
+
+def filter_stop_word(word_root: list) :
+    """
+    Filter stop words out of the segmentation result
+    """
+
+    print("Filtering stop words - start")
+
+    # Load the stop words as a set for fast membership tests
+    stop_word = set(load_stop_words())
+
+    print("Total word count before filtering: %d" % len(word_root))
+
+    # Filter out the stop words
+    word_root_filter = dict((key, value) for key, value in word_root if key not in stop_word)
+
+    print("Total word count after filtering: %d" % len(word_root_filter))
+
+    print("Filtering stop words - done")
+
+    return word_root_filter
+
+def main():
+
+    print("开始")
+
+    if not os.path.exists(DATA_KEYWORD_FILE):
+        raise Exception("待处理的数据文件不存在:%s" % DATA_KEYWORD_FILE)
+
+    # 读取数据
+    print("从待处理的数据文件中读取数据")
+    lines = None
+    with open(DATA_KEYWORD_FILE, "r", encoding=ENCODING_CHARSET) as f:
+        lines = f.readlines()
+    
+    # 执行分词和词频统计
+    word_root = cut_word_and_statistics(lines[:100])
+
+    # 过滤停用词
+    word_root_filter = filter_stop_word(word_root)
+        
+    # 导出过滤后的数据,不要表头和行号
+    # print("导出过滤后的结果")
+    # word_root_dataframe.to_csv(CUT_OUTPUT_FILE, header=False, index=False)
+    with open(CUT_OUTPUT_FILE, "w", encoding=ENCODING_CHARSET) as f:
+        for item in word_root_filter.items():
+            f.write("%s,%d\n" % item)
+
+    print("结束")
+
+
+if __name__ == '__main__':
+    main()
+

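Review note on cut_statistics.py: the manual if/else counting in cut_word_and_statistics is the pattern that collections.Counter implements directly. Below is a minimal sketch under the assumption that rows are CSV lines with the long-tail keyword in the first column; count_word_roots is an illustrative name, not part of this commit.

# Sketch only: the same count-and-sort logic using collections.Counter.
import jieba
from collections import Counter

def count_word_roots(rows):
    counter = Counter()
    for row in rows:
        keyword = row.split(",")[0].strip()            # first column only
        counter.update(jieba.cut_for_search(keyword))  # add every segmented word
    return counter.most_common()                       # (word, count) pairs, frequency descending

if __name__ == '__main__':
    print(count_word_roots(["什么长尾词怎么挖掘,100", "长尾关键词工具,80"]))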
+ 20 - 9
merge.py → merge_cut_word.py

@@ -3,9 +3,15 @@
 import os
 import zipfile
 
-EXCLUDE_FILES = ['打开乱码如何处理?.txt']
+# Directory of files to merge
 DATA_DIR = "E:\Download\长尾关键词\普通-p"
 
+# Output file for the merged result
+MERGE_OUTPUT_FILE = "./data/合并结果.txt"
+
+# Files excluded from the merge
+EXCLUDE_FILES = ['打开乱码如何处理?.txt']
+
 def get_files(path):
     '''
     Read the file names in the directory
@@ -15,7 +21,7 @@ def get_files(path):
         file_list.append(os.path.join(path,file))
     return file_list
 
-def merge_file_content(dir_path, dest_file = './merge.txt', exclude_file=EXCLUDE_FILES):
+def merge_file_content():
     """
     Merge the content of every file in the directory (keywords only)
 
@@ -35,12 +41,16 @@ def merge_file_content(dir_path, dest_file = './merge.txt', exclude_file=EXCLUDE
     print("----------- 开始 -----------")
 
     # Get the file list
-    print("Reading the file list")
-    files = get_files(dir_path)
+    
+    files = get_files(DATA_DIR)
+
+    total_num = len(files)
+
+    print("File list loaded, files to process: %d" % total_num)
 
-    with open(dest_file, "w", encoding="utf-8") as f:
+    with open(MERGE_OUTPUT_FILE, "w", encoding="utf-8") as f:
 
-        for file in files:
+        for i, file in enumerate(files):
             zfile = zipfile.ZipFile(file)
             filenames = zfile.namelist()
             for filename in filenames:
@@ -49,10 +59,10 @@ def merge_file_content(dir_path, dest_file = './merge.txt', exclude_file=EXCLUDE
                 realname = filename.encode('cp437').decode('gbk')
                 
                 # Skip files excluded from the merge
-                if realname in exclude_file:
+                if realname in EXCLUDE_FILES:
                     continue
 
-                print("Processing file:", realname)
+                print("Processing file: %s, progress: %d / %d" % (realname, i, total_num))
 
                 # Read the file inside the zip archive
                 with zfile.open(filename) as file_content:
@@ -66,5 +76,6 @@ def merge_file_content(dir_path, dest_file = './merge.txt', exclude_file=EXCLUDE
 
     print("----------- 结束 -----------")
 
-merge_file_content(DATA_DIR)
+if __name__ == '__main__':
+    merge_file_content()
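Review note on merge_cut_word.py: the line realname = filename.encode('cp437').decode('gbk') works because zipfile decodes entry names as cp437 whenever the archive does not set the UTF-8 flag, so re-encoding to cp437 and decoding as GBK recovers Chinese names written by Windows tools. Below is a minimal sketch that applies the re-decode only when the UTF-8 flag is absent; list_gbk_names and the sample path are illustrative, not part of this commit.

# Sketch only: recovering Chinese member names from a zip created with a GBK locale.
import zipfile

def list_gbk_names(zip_path):
    names = []
    with zipfile.ZipFile(zip_path) as zf:
        for info in zf.infolist():
            name = info.filename
            if not (info.flag_bits & 0x800):  # UTF-8 filename flag not set
                # zipfile already decoded the raw bytes as cp437; undo that and decode as GBK.
                name = name.encode('cp437').decode('gbk', errors='replace')
            names.append(name)
    return names

if __name__ == '__main__':
    print(list_gbk_names("E:\\Download\\sample.zip"))  # hypothetical archive path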