{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "\n",
    "ORIG_FILE = \"./data/范用性关键词-分词结果.csv\"\n",
    "DEST_FILE = \"./data/范用性关键词-分词结果-过滤停用词.csv\"\n",
    "DEST_FILE_FILTER = \"./data/范用性关键词-分词结果-过滤停用词-词频大于300.csv\"\n",
    "STOP_WORD_DIR = \"./data/stopwords\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(ORIG_FILE, names=['key','count'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>key</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>怎么</td>\n",
       "      <td>1051516</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>的</td>\n",
       "      <td>123009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>怎么办</td>\n",
       "      <td>93937</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>怎么样</td>\n",
       "      <td>91070</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>做</td>\n",
       "      <td>63034</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>116625</th>\n",
       "      <td>做文</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>116626</th>\n",
       "      <td>提微商</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>116627</th>\n",
       "      <td>仰卧</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>116628</th>\n",
       "      <td>起坐</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>116629</th>\n",
       "      <td>仰卧起坐</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>116630 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         key    count\n",
       "0         怎么  1051516\n",
       "1          的   123009\n",
       "2        怎么办    93937\n",
       "3        怎么样    91070\n",
       "4          做    63034\n",
       "...      ...      ...\n",
       "116625    做文        1\n",
       "116626   提微商        1\n",
       "116627    仰卧        1\n",
       "116628    起坐        1\n",
       "116629  仰卧起坐        1\n",
       "\n",
       "[116630 rows x 2 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "去重前，停用词数量： 5778\n",
      "去重后，停用词数量： 2462\n"
     ]
    }
   ],
   "source": [
    "# 导入停用词\n",
    "\n",
    "stop_word = [];\n",
    "\n",
    "stop_word_files = os.listdir(STOP_WORD_DIR)\n",
    "for file in stop_word_files:\n",
    "    stop_word_file = os.path.join(STOP_WORD_DIR, file)\n",
    "    with open(stop_word_file) as f:\n",
    "        for item in f:\n",
    "            stop_word.append(item.replace(\"\\n\",\"\"))\n",
    "print(\"去重前，停用词数量：\", len(stop_word))\n",
    "stop_word = list(set(stop_word))\n",
    "print(\"去重后，停用词数量：\", len(stop_word))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 过滤停用词\n",
    "df = df[df.apply(lambda row : row['key'] not in stop_word, axis=1)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导出过滤后的数据，不要表头和行号\n",
    "df.to_csv(DEST_FILE, header=False, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>115534.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>27.613802</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>311.900416</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>2.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>6.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>63034.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               count\n",
       "count  115534.000000\n",
       "mean       27.613802\n",
       "std       311.900416\n",
       "min         1.000000\n",
       "25%         1.000000\n",
       "50%         2.000000\n",
       "75%         6.000000\n",
       "max     63034.000000"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "大于1000的数量： 418\n",
      "大于500的数量： 1035\n",
      "大于400的数量： 1340\n",
      "大于300的数量： 1883\n",
      "大于250的数量： 2282\n",
      "大于100的数量： 5104\n"
     ]
    }
   ],
   "source": [
    "print(\"大于1000的数量：\", df[df['count'] > 1000].count().key)\n",
    "print(\"大于500的数量：\", df[df['count'] > 500].count().key)\n",
    "print(\"大于400的数量：\", df[df['count'] > 400].count().key)\n",
    "print(\"大于300的数量：\", df[df['count'] > 300].count().key)\n",
    "print(\"大于250的数量：\", df[df['count'] > 250].count().key)\n",
    "print(\"大于100的数量：\", df[df['count'] > 100].count().key)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 过滤词频小于300的数据\n",
    "df=df[df['count']>=300]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导出过滤后的数据，不要表头和行号，只要关键词列\n",
    "df.to_csv(DEST_FILE_FILTER, header=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[5:7].to_csv(\"./data/多线城-3.csv\", header=False)"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "679ecc657d123b537eb7946f00483c298ba68f4074c79757b9e8823d90af42fb"
  },
  "kernelspec": {
   "display_name": "Python 3.9.0 ('jieba')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}