| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697 |
- # -*- coding: utf-8 -*-
- import config
- import os
- import tools
- import logging
- import zipfile
# Directory holding the zip archives to merge.
# Raw string: the path contains backslashes; in a plain literal, sequences
# like "\D" are invalid escapes (SyntaxWarning on Python 3.12+).
DATA_DIR = r"E:\Download\长尾关键词\长尾关键词-什么\普通-p"
def get_files(path):
    """
    Return the joined paths of all entries directly under *path*.

    Parameters
    ----------
    path : str
        Directory to list (not recursed into).

    Returns
    -------
    list of str
        ``os.path.join(path, entry)`` for every entry in the directory,
        in ``os.listdir`` order.
    """
    return [os.path.join(path, entry) for entry in os.listdir(path)]
def merge_file_content():
    """
    Merge and de-duplicate keywords from every zip archive under DATA_DIR.

    For each zip file in DATA_DIR, reads every contained member (skipping
    names listed in ``config.MERGE_EXCLUDE_FILES``), takes the first CSV
    column of each data row as the keyword, de-duplicates via a set, and
    writes the unique keywords to ``config.MERGE_FILE`` (UTF-8, one per
    line).

    Side effects
    ------------
    Writes ``config.MERGE_FILE`` and emits progress via ``logging`` and
    ``tools.tip``.
    """
    # Collect the list of archives to process.
    files = get_files(DATA_DIR)
    total_num = len(files)
    logging.info("待处理文件数:%d", total_num)
    # Set used for de-duplication of keywords.
    repeat_set = set()
    # Keyword count before de-duplication.
    total_count = 0
    for i, file in enumerate(files):
        # Context manager closes each archive deterministically
        # (the original leaked the ZipFile handle).
        with zipfile.ZipFile(file) as zfile:
            for filename in zfile.namelist():
                # Zip member names are stored as cp437; re-decode them as
                # GBK to recover the real (Chinese) file name.
                realname = filename.encode('cp437').decode('gbk')

                # Skip excluded/irrelevant members.
                if realname in config.MERGE_EXCLUDE_FILES:
                    continue
                logging.info("正在处理文件: %s", realname)
                # Read the member's content from inside the archive.
                with zfile.open(filename) as file_content:
                    lines = file_content.readlines()
                    # Skip the two header lines.
                    for line in lines[2:]:
                        split = line.decode("gbk").split(",")
                        # Only the first column (the keyword) is needed.
                        repeat_set.add(split[0])
                        # Track the pre-deduplication total.
                        total_count += 1

        tools.tip(total_num, i)

    logging.info("正在保存合并结果,文件位置:%s,排重前数据量:%d,排重后数据量:%d",
                 config.MERGE_FILE, total_count, len(repeat_set))
    with open(config.MERGE_FILE, "w", encoding="utf-8") as f:
        for item in repeat_set:
            f.write(item)
            f.write("\n")
-
if __name__ == '__main__':
    # Script entry point: initialise logging, run the merge, log completion.
    title = "拓展词合并"
    tools.init_log()
    tools.log_start_msg(title)
    merge_file_content()
    tools.log_end_msg(title)
-
|