diff --git a/Dataset.ipynb b/Dataset.ipynb
new file mode 100644
index 0000000..3715414
--- /dev/null
+++ b/Dataset.ipynb
@@ -0,0 +1,315 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "2b263b84",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'C:\\\\Users\\\\Monoid\\\\anaconda3\\\\envs\\\\nn\\\\python.exe'"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import sys\n",
+ "sys.executable"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "610a9887",
+ "metadata": {},
+ "source": [
+ "개발 환경 확인"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5d2b9307",
+ "metadata": {},
+ "source": [
+ "먼저 데이터를 다운로드해야 한다. 다운로드 스크립트(`download.py`)가 있으니 실행하면 된다."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "5203952d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Extracting: 0%| | 0/14740 [00:00<?, ?files/s]\n",
+ "Extracting: 0%| | 6/14740 [00:00<05:31, 44.44files/s]\n",
+ "Extracting: 0%| | 11/14740 [00:00<06:02, 40.62files/s]\n",
+ "Extracting: 1%|1 | 205/14740 [00:00<00:18, 772.26files/s]\n",
+ "Extracting: 3%|2 | 421/14740 [00:00<00:11, 1264.91files/s]\n",
+ "Extracting: 4%|4 | 633/14740 [00:00<00:09, 1547.44files/s]\n",
+ "Extracting: 6%|5 | 850/14740 [00:00<00:07, 1743.46files/s]\n",
+ "Extracting: 7%|7 | 1063/14740 [00:00<00:07, 1865.46files/s]\n",
+ "Extracting: 9%|8 | 1276/14740 [00:00<00:06, 1947.67files/s]\n",
+ "Extracting: 10%|# | 1489/14740 [00:00<00:06, 1997.60files/s]\n",
+ "Extracting: 12%|#1 | 1711/14740 [00:01<00:06, 2065.32files/s]\n",
+ "Extracting: 13%|#3 | 1944/14740 [00:01<00:05, 2139.08files/s]\n",
+ "Extracting: 15%|#4 | 2160/14740 [00:01<00:06, 1835.43files/s]\n",
+ "Extracting: 16%|#5 | 2352/14740 [00:01<00:06, 1819.01files/s]\n",
+ "Extracting: 17%|#7 | 2565/14740 [00:01<00:06, 1898.90files/s]\n",
+ "Extracting: 19%|#9 | 2803/14740 [00:01<00:05, 2028.14files/s]\n",
+ "Extracting: 21%|## | 3041/14740 [00:01<00:05, 2127.11files/s]\n",
+ "Extracting: 22%|##2 | 3283/14740 [00:01<00:05, 2205.52files/s]\n",
+ "Extracting: 24%|##3 | 3534/14740 [00:01<00:04, 2294.41files/s]\n",
+ "Extracting: 26%|##5 | 3794/14740 [00:02<00:04, 2377.38files/s]\n",
+ "Extracting: 27%|##7 | 4053/14740 [00:02<00:04, 2440.19files/s]\n",
+ "Extracting: 29%|##9 | 4304/14740 [00:02<00:04, 2453.63files/s]\n",
+ "Extracting: 31%|### | 4551/14740 [00:02<00:04, 2415.44files/s]\n",
+ "Extracting: 33%|###2 | 4794/14740 [00:02<00:04, 2405.56files/s]\n",
+ "Extracting: 34%|###4 | 5038/14740 [00:02<00:04, 2408.61files/s]\n",
+ "Extracting: 36%|###5 | 5280/14740 [00:02<00:03, 2376.84files/s]\n",
+ "Extracting: 37%|###7 | 5518/14740 [00:02<00:04, 2191.01files/s]\n",
+ "Extracting: 39%|###9 | 5789/14740 [00:02<00:03, 2335.79files/s]\n",
+ "Extracting: 41%|####1 | 6061/14740 [00:02<00:03, 2438.34files/s]\n",
+ "Extracting: 43%|####2 | 6333/14740 [00:03<00:03, 2519.66files/s]\n",
+ "Extracting: 45%|####4 | 6594/14740 [00:03<00:03, 2538.63files/s]\n",
+ "Extracting: 46%|####6 | 6850/14740 [00:03<00:03, 2479.35files/s]\n",
+ "Extracting: 48%|####8 | 7100/14740 [00:03<00:03, 2422.08files/s]\n",
+ "Extracting: 50%|####9 | 7344/14740 [00:03<00:03, 2372.63files/s]\n",
+ "Extracting: 51%|#####1 | 7583/14740 [00:03<00:03, 2330.83files/s]\n",
+ "Extracting: 53%|#####3 | 7817/14740 [00:03<00:03, 2281.19files/s]\n",
+ "Extracting: 55%|#####4 | 8046/14740 [00:04<00:04, 1437.45files/s]\n",
+ "Extracting: 56%|#####5 | 8229/14740 [00:04<00:04, 1511.52files/s]\n",
+ "Extracting: 57%|#####7 | 8442/14740 [00:04<00:03, 1650.87files/s]\n",
+ "Extracting: 59%|#####8 | 8649/14740 [00:04<00:03, 1752.92files/s]\n",
+ "Extracting: 60%|###### | 8857/14740 [00:04<00:03, 1832.61files/s]\n",
+ "Extracting: 62%|######1 | 9078/14740 [00:04<00:02, 1934.07files/s]\n",
+ "Extracting: 63%|######2 | 9284/14740 [00:04<00:02, 1870.39files/s]\n",
+ "Extracting: 64%|######4 | 9480/14740 [00:04<00:02, 1773.62files/s]\n",
+ "Extracting: 66%|######5 | 9701/14740 [00:04<00:02, 1890.49files/s]\n",
+ "Extracting: 67%|######7 | 9917/14740 [00:04<00:02, 1960.47files/s]\n",
+ "Extracting: 69%|######8 | 10137/14740 [00:05<00:02, 2028.22files/s]\n",
+ "Extracting: 70%|####### | 10365/14740 [00:05<00:02, 2094.65files/s]\n",
+ "Extracting: 72%|#######1 | 10584/14740 [00:05<00:01, 2116.22files/s]\n",
+ "Extracting: 73%|#######3 | 10803/14740 [00:05<00:01, 2131.59files/s]\n",
+ "Extracting: 75%|#######4 | 11019/14740 [00:05<00:01, 2139.94files/s]\n",
+ "Extracting: 76%|#######6 | 11239/14740 [00:05<00:01, 2157.61files/s]\n",
+ "Extracting: 78%|#######7 | 11461/14740 [00:05<00:01, 2169.71files/s]\n",
+ "Extracting: 79%|#######9 | 11679/14740 [00:05<00:01, 2134.66files/s]\n",
+ "Extracting: 81%|######## | 11893/14740 [00:05<00:01, 1815.46files/s]\n",
+ "Extracting: 82%|########2 | 12110/14740 [00:06<00:01, 1908.83files/s]\n",
+ "Extracting: 84%|########3 | 12325/14740 [00:06<00:01, 1974.41files/s]\n",
+ "Extracting: 85%|########5 | 12544/14740 [00:06<00:01, 2029.27files/s]\n",
+ "Extracting: 87%|########6 | 12758/14740 [00:06<00:00, 2060.91files/s]\n",
+ "Extracting: 88%|########8 | 12981/14740 [00:06<00:00, 2109.92files/s]\n",
+ "Extracting: 90%|########9 | 13195/14740 [00:06<00:00, 2106.38files/s]\n",
+ "Extracting: 91%|######### | 13408/14740 [00:06<00:00, 2088.79files/s]\n",
+ "Extracting: 92%|#########2| 13619/14740 [00:06<00:00, 2058.78files/s]\n",
+ "Extracting: 94%|#########3| 13826/14740 [00:06<00:00, 2056.07files/s]\n",
+ "Extracting: 95%|#########5| 14033/14740 [00:06<00:00, 1908.91files/s]\n",
+ "Extracting: 97%|#########6| 14234/14740 [00:07<00:00, 1931.80files/s]\n",
+ "Extracting: 98%|#########7| 14438/14740 [00:07<00:00, 1962.63files/s]\n",
+ "Extracting: 99%|#########9| 14636/14740 [00:07<00:00, 1923.14files/s]\n",
+ "Extracting: 100%|##########| 14740/14740 [00:07<00:00, 1954.34files/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "!python download.py"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a81b5d1e",
+ "metadata": {},
+ "source": [
+ "`README.md` 를 보면\n",
+ "\n",
+ "- Each file is consisted of three columns: `id`, `document`, `label`\n",
+ " - `id`: The review id, provided by Naver\n",
+ " - `document`: The actual review\n",
+ " - `label`: The sentiment class of the review. (0: negative, 1: positive)\n",
+ " - Columns are delimited with tabs (i.e., `.tsv` format; but the file extension is `.txt` for easy access for novices)\n",
+ "\n",
+ "라고 적혀있다.\n",
+ "\n",
+ "tsv 형식이므로 `csv` 모듈의 reader에 구분자로 탭(`\\t`)을 지정하면 쉽게 읽을 수 있다."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "ac79aced",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "list\n",
+ "OrderedDict([('id', '8112052'), ('document', '어릴때보고 지금다시봐도 재밌어요ㅋㅋ'), ('label', '1')])\n",
+ "OrderedDict([('id', '8132799'), ('document', '디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산업이 부러웠는데. 사실 우리나라에서도 그 어려운시절에 끝까지 열정을 지킨 노라노 같은 전통이있어 저와 같은 사람들이 꿈을 꾸고 이뤄나갈 수 있다는 것에 감사합니다.'), ('label', '1')])\n",
+ "OrderedDict([('id', '4655635'), ('document', '폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.'), ('label', '1')])\n",
+ "OrderedDict([('id', '9251303'), ('document', '와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런게 진짜 영화지'), ('label', '1')])\n",
+ "OrderedDict([('id', '10067386'), ('document', '안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.'), ('label', '1')])\n",
+ "OrderedDict([('id', '2190435'), ('document', '사랑을 해본사람이라면 처음부터 끝까지 웃을수 있는영화'), ('label', '1')])\n",
+ "OrderedDict([('id', '9279041'), ('document', '완전 감동입니다 다시봐도 감동'), ('label', '1')])\n",
+ "OrderedDict([('id', '7865729'), ('document', '개들의 전쟁2 나오나요? 나오면 1빠로 보고 싶음'), ('label', '1')])\n",
+ "OrderedDict([('id', '7477618'), ('document', '굿'), ('label', '1')])\n",
+ "OrderedDict([('id', '9250537'), ('document', '바보가 아니라 병 쉰 인듯'), ('label', '1')])\n",
+ "OrderedDict([('id', '9730759'), ('document', '내 나이와 같은 영화를 지금 본 나는 감동적이다..하지만 훗날 다시보면대사하나하나 그 감정을완벽하게 이해할것만 같다...'), ('label', '1')])\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os\n",
+ "import csv\n",
+ "\n",
+ "BASE_PATH = \"nsmc/nsmc-master\"\n",
+ "print(\"list\")\n",
+ "i = 0\n",
+ "with open(os.path.join(BASE_PATH,\"ratings.txt\"), \"r\", newline='\\n', encoding=\"utf-8\") as fp:\n",
+ " rd = csv.DictReader(fp,delimiter='\\t')\n",
+ " for data in rd:\n",
+ " print(data)\n",
+ " i += 1\n",
+ " if i > 10:\n",
+ " break"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "245e9b05",
+ "metadata": {},
+ "source": [
+ "잘 읽히는 것을 볼 수가 있다."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "a9efd391",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from IPython.display import HTML"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "36b7c38e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<pre>from io import TextIOWrapper\n",
+ "from typing import List, Union\n",
+ "import os\n",
+ "import csv\n",
+ "from dataclasses import dataclass\n",
+ "import tqdm\n",
+ "@dataclass\n",
+ "class NsmcRawData:\n",
+ " id: int\n",
+ " document: str\n",
+ " label: int\n",
+ "\n",
+ "class NsmcRawDataReader:\n",
+ " def __init__(self, file: Union[str, TextIOWrapper]):\n",
+ " self.fp = file\n",
+ " self.need_close = isinstance(file,str)\n",
+ " if self.need_close:\n",
+ " self.fp = open(file,\"r\",encoding=\"utf-8\",newline='\\n')\n",
+ " self.rd = csv.DictReader(self.fp,delimiter='\\t')\n",
+ "\n",
+ " def __iter__(self):\n",
+ " mapper = lambda data: NsmcRawData(int(data[\"id\"]),data[\"document\"],int(data[\"label\"]))\n",
+ " return iter(map(mapper,self.rd))\n",
+ " \n",
+ " def close(self):\n",
+ " if self.need_close:\n",
+ " self.fp.close()\n",
+ " \n",
+ " def __enter__(self):\n",
+ " return self\n",
+ " def __exit__(self, exc_type, exc_val, exc_tb):\n",
+ " self.close()\n",
+ "\n",
+ "def readNsmcRawData(file: Union[str, TextIOWrapper], use_tqdm = False, total: int = 0) -> List[NsmcRawData]:\n",
+ " dataset = []\n",
+ " with NsmcRawDataReader(file) as dataReader:\n",
+ " if use_tqdm and total > 0:\n",
+ " for d in tqdm.tqdm(dataReader, total=total):\n",
+ " dataset.append(d)\n",
+ " else:\n",
+ " for data in dataReader:\n",
+ " dataset.append(data)\n",
+ " return dataset\n",
+ "\n",
+ "BASE_PATH = \"nsmc/nsmc-master\"\n",
+ "\n",
+ "if __name__ == \"__main__\":\n",
+ " dataset = []\n",
+ " raw = readNsmcRawData(f\"{BASE_PATH}/ratings.txt\", use_tqdm= True, total = 200000)</pre>"
+ ],
+ "text/plain": [
+ "<IPython.core.display.HTML object>"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "HTML('<pre>'+open('ndata.py',\"r\").read()+\"</pre>\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "feddd692",
+ "metadata": {},
+ "source": [
+ "그래서 위와 같이 `ndata.py`에 데이터를 읽는 코드를 짰다."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "251c75dd",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}