Jelajahi Sumber

整理代码

ChenGanBin 3 tahun lalu
induk
melakukan
d95937f982
10 mengubah file dengan 238 tambahan dan 111 penghapusan
  1. 1 12
      .gitignore
  2. 2 3
      agg_word.py
  3. 0 73
      cal.py
  4. 0 1
      config.py
  5. 1 3
      cut.py
  6. 6 2
      key_index.py
  7. 1 1
      key_reverse.py
  8. 8 3
      key_reverse_index.py
  9. 218 0
      key_reverse_statistics.py
  10. 1 13
      statistics.py

+ 1 - 12
.gitignore

@@ -1,13 +1,2 @@
 __pycache__/
-data/tmp/*.pkl
-data/tmp/*.txt
-data/tmp/*.csv
-data/pkl/*.pkl
-data/*.txt
-data/*.csv
-临时/
-data_bak/
-src_bak/
-data/analyse/
-data/analyse_bak/
-data/cache/
+data/

+ 2 - 3
agg_word.py

@@ -6,12 +6,11 @@ from itertools import combinations
 import math
 import mmap
 import os
-from time import sleep, time
+from time import time
 from cal import cal_cos_sim
 
 import config
 import tools
-import stop_word
 import re
 import logging
 
@@ -36,7 +35,7 @@ def intesect(x, y):
 
 if __name__ != "__main__":
     # 停用词
-    stop_word_index = stop_word.load_stop_word()
+    stop_word_index = tools.load_stop_word()
 
     # KEY表索引
     key_index = tools.load_obj(config.KEY_INDEX_CACHE)

+ 0 - 73
cal.py

@@ -41,76 +41,3 @@ def cal_cos_sim(a_word:str, a_stem:list, b_word:str, b_stem:list):
     val = col_sim(np.array(a_vec), np.array(b_vec))
     return val
 
-
-if __name__ == "__main__":
-
-    # a_word= "QQ邮箱格式怎么写"
-    # b_word= "QQ邮箱格式如何写"
-    # a_word_root = ['QQ', '邮箱', '格式', '怎么', '写']
-    # b_word_root = ['QQ', '邮箱', '格式', '如何', '写']
-    # print(cal_cos_sim(a_word, a_word_root, b_word, b_word_root))
-
-     # 合并词根,用于生成词向量
-    # union_word_root = merge_stem(a_word_root, b_word_root)
-    # print(union_word_root)
-
-    # # 生成词向量
-    # a_vec, b_vec = gen_word_vec(a_word, b_word, union_word_root)
-    # print(a_vec)
-    # print(b_vec)
-    # # a_vec = [1,1,1,1,0,1]
-    # # b_vec = [1,1,1,0,1,1]
-    # print(col_sim(np.array(a_vec), np.array(b_vec)))
-
-    # s = "0,腋下长了一个小疙瘩是什么东西,['腋下', '长', '了', '一个', '小', '疙瘩', '是', '什么', '东西']"
-    # s_r = r"'([^,]*)'"
-    # pattern = re.compile(s_r, re.I)
-    # for i in pattern.findall(s):
-    #     print(i)
-    
-    # s_r = r"([\d]*),(.*),\["
-    # pattern = re.compile(s_r, re.I)
-    # m = pattern.match(s)
-    # for i in m.groups():
-    #     print(i)
-
-    # import mmap
-    # with open(config.KEY_FILE, "r", encoding=config.ENCODING_CHARSET) as f_key, \
-    #     mmap.mmap(f_key.fileno(), 0, access=mmap.ACCESS_READ) as f_key_mmap:
-
-    #     key_info_re = r"([\d]*),(.*),\["
-    #     key_info_pattern = re.compile(key_info_re, re.I)
-
-    #     s_r = r"'([^,]*)'"
-    #     s_pattern = re.compile(s_r, re.I)
-
-    #     a_line = f_key_mmap.readline().decode("UTF-8")
-    #     b_line = f_key_mmap.readline().decode("UTF-8")
-
-    #     a_m = key_info_pattern.match(a_line)
-    #     a_key = a_m.group(2)
-    #     a_stem = s_pattern.findall(a_line)
-    #     print(a_stem)
-
-    #     b_m = key_info_pattern.match(b_line)
-    #     b_key = b_m.group(2)
-    #     b_stem = s_pattern.findall(b_line)
-    #     print(b_stem)
-
-    #     print(cal_cos_sim(a_key, a_stem, b_key, b_stem))
-
-    # a_key = "吃什么东西减肥最快"
-    # a_stem = ['吃', '什么', '东西', '减肥', '最快']
-
-    # b_key="vc++读写什么文件最快"
-    # b_stem =['v', 'c++', '读写', '什么', '文件', '最快']
-    # print(cal_cos_sim(a_key, a_stem, b_key, b_stem))
-
-    # print(re.findall("c\\+\\+", "vc++读写什么文件最快"))
-    
-    
-    print("".join([".", "?", "^", "$", "*", "+", "\\", "[", "]", "|", "{", "}", "(", ")"]))
-    # s = r"([.?^$*+\[]|{}()])"
-    s=r"([\\])"
-    re.findall(s, "vc++读写什么文件最快")
-    print(re.findall(s, "vc++读写什么文件\最快"))

+ 0 - 1
config.py

@@ -52,7 +52,6 @@ ANALYSE_BITMAP_CACHE = "./data/cache/analyse_bitmap.pkl"
 ANALYSE_PROCESS_CACHE = "./data/cache/analyse_process.pkl"
 
 # 正则表达式中需要额外处理的特殊符号
-# RE_SPECIAL_SIMBOL = "'.', '?', '^', '$', '*', '+', '\\', '[', ']', '|', '{', '}', '(', ')"
 RE_SPECIAL_SIMBOL = [".", "?", "^", "$", "*", "+", "\\", "[", "]", "|", "{", "}", "(", ")"]
 
 # 百分比进度提示

+ 1 - 3
cut.py

@@ -7,8 +7,6 @@ import jieba
 import logging
 import logging.config
 
-from stop_word import load_stop_word
-
 
 # 待处理的数据文件
 INPUT_FILE = "E:\Download\怎么长尾词_1655561719.csv"
@@ -27,7 +25,7 @@ def cut_word_and_statistics(data):
     # 分词结果容器
     key_dict = {}
     # 停用词
-    stop_word = load_stop_word()
+    stop_word = tools.load_stop_word()
     # 待处理数据总数量
     total_num = len(data)
 

+ 6 - 2
key_index.py

@@ -31,8 +31,12 @@ def main():
             # 进度显示
             tools.tip_in_size(total_num, cur_pos)
         
-        # 保存索引
-        tools.save_obj(config.KEY_INDEX_CACHE, key_index)
+    with open("./data/tmp/key_index_test.csv", "w", encoding=config.ENCODING_CHARSET) as f:
+        f.write(",".join([str(i) for i in key_index]))
+
+        
+    # 保存索引
+    # tools.save_obj(config.KEY_INDEX_CACHE, key_index)
 
 
 if __name__ == '__main__':

+ 1 - 1
key_reverse.py

@@ -138,7 +138,7 @@ def main_process():
 
     # 保存到本地文件
     logging.info("主进程 保存到本地")
-    with open(config.KEY_REVERSE_FILE, "w", encoding=config.ENCODING_CHARSET) as f_reverse, \
+    with open("./data/tmp/reverse_test.csv", "w", encoding=config.ENCODING_CHARSET) as f_reverse, \
         open(config.KEY_REVERSE_STATISTICS_FILE, "w", encoding=config.ENCODING_CHARSET) as f_statistics:
         for key, val_obj in sorted_reverse_list:
             f_reverse.write("%s,%s\n" % (key, val_obj["indexs"]))

+ 8 - 3
key_reverse_index.py

@@ -12,7 +12,7 @@ def main():
     tools.log_start_msg(TITLE)
 
     # 关键词倒排索引容器
-    key_reverse_index_cache = {}
+    reverse_index = []
 
     with open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as freverse, \
         mmap.mmap(freverse.fileno(), 0, access=mmap.ACCESS_READ) as fmmap:
@@ -32,13 +32,18 @@ def main():
             
             # 获取词根位置,建立词根和位置的关系
             index = line.index(",")
-            key_reverse_index_cache[line[:index]]=cur_pos
+            key = line[:index]
+            next_pos = fmmap.tell()
+            reverse_index.append((key, cur_pos, next_pos))
             
             # 进度显示
             tools.tip_in_size(total_num, cur_pos)
         
         # 保存索引
-        tools.save_obj(config.KEY_REVERSE_INDEX_CACHE, key_reverse_index_cache)
+        with open("./data/tmp/reverse_index_test.csv", "w", encoding=config.ENCODING_CHARSET) as f:
+            for key, cur_pos, next_pos in reverse_index:
+                f.write("%s,%d,%d\n" % (key, cur_pos, next_pos))
+        # tools.save_obj(config.KEY_REVERSE_INDEX_CACHE, key_reverse_index_cache)
 
     tools.log_end_msg(TITLE)
 

+ 218 - 0
key_reverse_statistics.py

@@ -0,0 +1,218 @@
+# -*- coding:utf-8 -*-
+
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import mmap
+from multiprocessing.connection import wait
+import random
+import sys
+from time import sleep, time
+import os
+import config
+import tools
+import ast
+import re
+import stop_word
+import logging
+import math
+from multiprocessing import Process, Pool
+
+TITLE = "关键词倒排文件 统计"  # task title handed to tools.log_start_msg / tools.log_end_msg
+
+# def reverse_statistics(start_pos, end_pos):
+
+def handle(start_pos, end_pos):
+    """Count the entries of each reverse-index line inside a byte range.
+
+    Lines of ``config.KEY_REVERSE_FILE`` are read through a memory map,
+    beginning at ``start_pos``. Each line is split at its first comma: the
+    text before it is the key, the remainder is parsed with
+    ``ast.literal_eval`` and its length is recorded for that key.
+
+    Args:
+        start_pos: Offset the memory map is seeked to before reading.
+        end_pos: Offset at which reading stops (exclusive).
+
+    Returns:
+        A dict with ``"pid"`` (id of the executing process) and
+        ``"statistics"`` (mapping of key to parsed-value length).
+
+    Raises:
+        ValueError: If a line read from the range contains no comma.
+    """
+    pid = os.getpid()
+    print("进程:%d, 统计开始,开始位置:%d,结束位置:%d" % (pid, start_pos, end_pos))
+
+    # key -> number of elements in the literal stored on that key's line
+    counts = {}
+
+    with open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as fr, \
+        mmap.mmap(fr.fileno(), 0, access=mmap.ACCESS_READ) as fmmap:
+        fmmap.seek(start_pos)
+
+        # The bound is tested at the start of every line, so a line that
+        # begins before end_pos is always read in full.
+        while fmmap.tell() < end_pos:
+            text = fmmap.readline().decode(config.ENCODING_CHARSET)
+            comma = text.index(",")
+            counts[text[:comma]] = len(ast.literal_eval(text[comma + 1:]))
+
+    logging.info("进程:%d, 统计结束" % pid)
+
+    return {"pid": pid, "statistics": counts}
+    
+
+def main2():
+    """Compute per-key statistics of the reverse file with a process pool.
+
+    The cached reverse index is loaded, its positions are split into at most
+    ``os.cpu_count()`` contiguous byte ranges, and each range is handed to
+    ``handle`` in a worker process. All results are then written to
+    ``config.KEY_REVERSE_STATISTICS_FILE`` as ``key,count`` lines.
+
+    NOTE(review): the values of the cached index are assumed to be integer
+    byte offsets of line starts in ``config.KEY_REVERSE_FILE`` (they are passed
+    to ``handle`` as seek positions) -- confirm against the index builder.
+    """
+    # Logging setup and start banner
+    tools.init_log()
+    tools.log_start_msg(TITLE)
+
+    # Number of worker processes; os.cpu_count() may return None
+    process_num = os.cpu_count() or 1
+
+    # Load the cached index (key -> position)
+    key_reverse_index = tools.load_obj(config.KEY_REVERSE_INDEX_CACHE)
+
+    # Sort the positions so that consecutive boundaries always describe a
+    # forward byte range, whatever order the cache was built in.
+    start_positions = sorted(key_reverse_index.values())
+    total = len(start_positions)
+
+    results = []
+    # An empty index has nothing to split; skip straight to writing the file.
+    if total:
+        # Number of index entries handled by each worker
+        internal = math.ceil(total / process_num)
+
+        # One boundary per chunk start, closed by the file size so that the
+        # final line of the file is part of the last range.
+        pos_list = start_positions[::internal]
+        pos_list.append(os.path.getsize(config.KEY_REVERSE_FILE))
+
+        # The context manager shuts the pool down even if a worker raises.
+        with ProcessPoolExecutor(process_num) as pool:
+            process_futures = [
+                pool.submit(handle, start_pos, end_pos)
+                for start_pos, end_pos in zip(pos_list, pos_list[1:])
+            ]
+
+            for future in as_completed(process_futures):
+                result = future.result()
+                logging.info("进程:%d, 统计结束" % result["pid"])
+                results.append(result)
+
+    logging.info("统计结束,保存至本地 - 开始")
+    with open(config.KEY_REVERSE_STATISTICS_FILE, "w", encoding=config.ENCODING_CHARSET) as fw:
+        for r in results:
+            for key, value in r["statistics"].items():
+                fw.write("%s,%s\n" % (key, value))
+    # Closing message matches the opening one above.
+    logging.info("统计结束,保存至本地 - 结束")
+
+    tools.log_end_msg(TITLE)
+
+    # 测试代码3
+    # pool = ProcessPoolExecutor(3)
+    # for i in range(1,5):
+    #     pool.submit(handle, "测试进程-%d"%i, i, i*10)
+
+    # pool.shutdown(wait=True)
+
+    # 测试代码2
+    # pool = Pool(3)
+    # for i in range(1,5):
+    #     pool.apply_async(handle, ("测试进程-%d"%i, i, i*10))
+    # pool.close()
+    # pool.join()
+    # print("结束")
+
+    # 测试代码1
+    # p = Process(target=handle, args=('测试进程', 1, 10))
+    # p.start()
+    # p.join()
+    
+    # tools.init_log()
+    # tools.log_start_msg(TITLE)
+
+    # key_reverse_index = tools.load_obj(config.KEY_REVERSE_INDEX_CACHE)
+
+    # tmp = [key for key in key_reverse_index.keys()]
+    
+    # l = len(tmp)
+    # print("总长:", l)
+    # internal = math.ceil(l / 4)
+    # print("间隔:", internal)
+    # pos = []
+    # for i in range(5):
+    #     t = i*internal
+    #     if t > l:
+    #         t = l-1
+    #     pos.append(t)
+    # print(pos)
+
+    # for item in pos:
+    #     key = tmp[item:item+1]
+    #     print(key)
+    #     pos = key_reverse_index[key[0]]
+    #     print(key, pos)
+
+    
+    # reverse_statistics = {}
+    # logging.info("统计开始")
+    # with open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as fr, \
+    #     mmap.mmap(fr.fileno(), 0 , access=mmap.ACCESS_READ) as fmmap:
+    #     for line in fr:
+    #         index=line.index(",")
+    #         key = line[:index]
+    #         word_root = line[index+1:]
+    #         word_root = ast.literal_eval(word_root)
+    #         l = len(word_root)
+            
+    #         reverse_statistics[key]=l
+
+    # logging.info("统计结束,保存至本地")
+    # with open(config.KEY_REVERSE_STATISTICS_FILE, "w", encoding=config.ENCODING_CHARSET) as fw:
+    #     for key, value in reverse_statistics:
+    #         fw.write("%s,%s\n"%(key,value))
+
+    # tools.log_end_msg(TITLE)
+
+
+def main():
+    """Single-process variant: compute per-key statistics of the reverse file.
+
+    Every line of ``config.KEY_REVERSE_FILE`` is split at its first comma; the
+    text before it is the key and the remainder is parsed with
+    ``ast.literal_eval``. The length of the parsed value is written to
+    ``config.KEY_REVERSE_STATISTICS_FILE`` as a ``key,count`` line.
+
+    Raises:
+        ValueError: If a line contains no comma.
+    """
+    tools.init_log()
+    tools.log_start_msg(TITLE)
+
+    reverse_statistics = {}
+    logging.info("统计开始")
+    # No memory map is needed here: the file is consumed by plain line
+    # iteration, which also copes with an empty file.
+    with open(config.KEY_REVERSE_FILE, "r", encoding=config.ENCODING_CHARSET) as fr:
+        for line in fr:
+            index = line.index(",")
+            key = line[:index]
+            word_root = ast.literal_eval(line[index + 1:])
+            reverse_statistics[key] = len(word_root)
+
+    logging.info("统计结束,保存至本地")
+    with open(config.KEY_REVERSE_STATISTICS_FILE, "w", encoding=config.ENCODING_CHARSET) as fw:
+        # .items() is required: iterating the dict itself yields only keys.
+        for key, value in reverse_statistics.items():
+            fw.write("%s,%s\n" % (key, value))
+
+    tools.log_end_msg(TITLE)
+
+if __name__ == "__main__":  # run only when executed as a script
+    main2()  # multi-process implementation

+ 1 - 13
statistics.py

@@ -131,11 +131,6 @@ def memory_statistics():
 
 def main():
 
-    # num = 459789
-    # print(num%10000)
-    # print(num//10000)
-    # return
-
     tools.init_log()
     tools.log_start_msg(TASK_TITLE)
 
@@ -146,11 +141,4 @@ def main():
 
 
 if __name__ == "__main__":
-    # print("加载开始")
-    # cache = tools.load_obj(config.KEY_REVERSE_INDEX_HOT_CACHE+".bak")
-    # print("加载结束")
-    # time.sleep(20)
-
-    Shape = namedtuple('Shape', ['x', 'y', 'z'])
-    exm = Shape(1, 2, 3)
-    print(exm.index(2))
+    main()