# -*- coding:utf-8 -*-
"""Cut long-tail keywords into words with jieba and count word frequencies."""
import logging
import logging.config
import os
from collections import Counter

import jieba

import config
import tools
from stop_word import load_stop_word

TITLE = "分词处理"

# Data file to process.
# Raw string: the Windows path contains backslashes that must not be
# interpreted as escape sequences ("\D", "\怎" are invalid escapes).
DATA_FILE = r"E:\Download\怎么长尾词_1655561719.csv"


def cut_word_and_statistics(data):
    """Segment the first CSV column of each line and count word frequency.

    :param data: list of CSV lines; only the first comma-separated column
                 (the long-tail keyword) of each line is used.
    :return: list of ``(word, count)`` tuples sorted by count, descending.
    """
    logging.info("开始执行分词操作并进行词频统计")
    # Word-frequency accumulator.
    word_counter = Counter()
    # Stop words excluded from the statistics.
    stop_word = load_stop_word()
    # Total number of rows, used for the progress indicator.
    total_num = len(data)
    logging.info("共需处理 %d 条数据" % total_num)
    for i, item in enumerate(data):
        # Only the first column holds the keyword; strip the trailing newline.
        long_tail_key = item.split(",")[0].replace("\n", "")
        # Search-engine style segmentation (finer grained than the default cut).
        for word in jieba.cut_for_search(long_tail_key):
            # Skip stop words.
            if word not in stop_word:
                word_counter[word] += 1
        # Progress indicator.
        tools.tip(total_num, i)
    # Sort by frequency, highest first (stable, so equal counts keep
    # insertion order, matching the previous dict-based implementation).
    logging.info("根据词频进行倒序排列")
    sorted_key_list = sorted(word_counter.items(), key=lambda x: x[1], reverse=True)
    logging.info("分词操作并进行词频统计 结束")
    return sorted_key_list


def main():
    """Entry point: read DATA_FILE, run the word statistics, export the result."""
    # Logging initialization.
    tools.init_log()
    tools.log_start_msg(TITLE)
    if not os.path.exists(DATA_FILE):
        logging.warning("待处理的数据文件不存在:%s" % DATA_FILE)
        return
    # Read the input data.
    logging.info("正在读取待处理的数据文件:%s" % DATA_FILE)
    with open(DATA_FILE, "r", encoding=config.ENCODING_CHARSET) as f:
        lines = f.readlines()
    # Run segmentation and frequency statistics (skip the two header rows).
    word_root_list = cut_word_and_statistics(lines[2:])
    # Export the result as "word,count" lines.
    logging.info("正在导出分词数据,位置:%s" % config.CUT_FILE)
    with open(config.CUT_FILE, "w", encoding=config.ENCODING_CHARSET) as f:
        for key, count in word_root_list:
            f.write("%s,%d\n" % (key, count))
    tools.log_end_msg(TITLE)


if __name__ == '__main__':
    main()