feat(read_data): add English data

This commit is contained in:
monoid 2022-02-22 16:33:07 +09:00
parent bb1e0b5c64
commit 883f39d645

View File

@ -1,5 +1,6 @@
import enum import enum
from typing import NamedTuple, List, Sequence, TypeVar from io import TextIOWrapper
from typing import Iterable, NamedTuple, List, Sequence, TypeVar
import json import json
KoreanBase="[Ko, En] NER, POStag data/국문 NER, POS" KoreanBase="[Ko, En] NER, POStag data/국문 NER, POS"
@ -25,48 +26,64 @@ class Sentence(NamedTuple):
self.namedEntity.append(namedEntity) self.namedEntity.append(namedEntity)
self.detail.append(detail) self.detail.append(detail)
T = TypeVar('T') T = TypeVar('T')
def readDataList(lst: Iterable[str], sep="\t"):
    """Group lines into blank-line-separated records of split fields.

    Each non-blank line is stripped and split on *sep*; consecutive split
    lines are accumulated until a blank line is met, at which point the
    accumulated group is yielded.

    Args:
        lst: iterable of text lines (e.g. an open file object).
        sep: field separator used to split each line (default: tab).

    Yields:
        List[List[str]]: one list of split-field rows per record.
    """
    ret: List[List[str]] = []
    for raw in lst:
        line = raw.strip()
        if line == "":
            # Skip empty groups caused by consecutive blank lines.
            if ret:
                yield ret
                # Rebind instead of ret.clear(): callers may keep a
                # reference to the yielded list, and clear() would wipe
                # their data (e.g. list(readDataList(...)) returned only
                # empty lists with the old implementation).
                ret = []
        else:
            ret.append(line.split(sep))
    # The old implementation silently dropped a trailing record when the
    # input did not end with a blank line; yield it so no data is lost.
    if ret:
        yield ret
def readKoreanData(fp: TextIOWrapper) -> List[Sentence]:
    """Parse Korean NER/POS data from an open file into Sentence objects.

    Each record is a blank-line-separated block of tab-separated rows of
    the form ``word/pos<TAB>pos<TAB>namedEntity<TAB>detail``; the surface
    word is the text before the first "/".

    Args:
        fp: an open text file positioned at the start of the data.

    Returns:
        List of parsed Sentence objects, one per record.
    """
    # NOTE(monoid): Do not use csv reader.
    sentences: List[Sentence] = []
    for lines in readDataList(fp):
        sentence = Sentence([], [], [], [])
        for line in lines:
            word_pos: str = line[0]
            # "word/pos" -> keep only the surface word before the first "/".
            words = word_pos.split("/")
            sentence.append(words[0], line[1], line[2], line[3])
        sentences.append(sentence)
    # Do not close fp here: callers open it in a `with` block and own its
    # lifetime; closing a caller-owned handle is surprising and redundant.
    return sentences
def readEnglishData(fp: TextIOWrapper) -> List[Sentence]:
    """Parse English CoNLL-style data from an open file into Sentences.

    Records are blank-line-separated blocks of space-separated rows;
    "-DOCSTART-" document-boundary marker lines are skipped.

    Args:
        fp: an open text file positioned at the start of the data.

    Returns:
        List of parsed Sentence objects, one per record.
    """
    sentences = []
    for block in readDataList(fp, sep=" "):
        # A single-row block whose first field is "-DOCSTART-" marks a
        # document boundary, not a sentence.
        if len(block) == 1 and block[0][0] == "-DOCSTART-":
            continue
        sent = Sentence([], [], [], [])
        for fields in block:
            sent.append(fields[0], fields[1], fields[2], fields[3])
        sentences.append(sent)
    return sentences
def readKoreanDataAll():
    """Load all Korean NER/POS splits from disk.

    Returns:
        (train, dev, test) tuple; each entry is a List[Sentence].
    """
    # Load in the same order as before: dev, test, train.
    splits = {}
    for name in ("dev", "test", "train"):
        with open(f"{KoreanBase}/{name}.txt", encoding="utf-8") as fp:
            splits[name] = readKoreanData(fp)
    return splits["train"], splits["dev"], splits["test"]
def readEnglishDataAll():
    """Load all English (CoNLL-style) splits from disk.

    The dev split is read from ``valid.txt`` per the dataset's naming.

    Returns:
        (train, dev, test) tuple; each entry is a List[Sentence].
    """
    # Removed leftover debug print("a")/print("b") statements that wrapped
    # the valid-split load.
    with open(f"{EnglishBase}/valid.txt", encoding="utf-8") as fp:
        dev = readEnglishData(fp)
    with open(f"{EnglishBase}/test.txt", encoding="utf-8") as fp:
        test = readEnglishData(fp)
    with open(f"{EnglishBase}/train.txt", encoding="utf-8") as fp:
        train = readEnglishData(fp)
    return train, dev, test
class TagIdConverter: class TagIdConverter:
@ -151,8 +168,6 @@ def make_long_namedEntity(a,b,c):
break break
return ret return ret
""" """
extracts and stores tags set from the given data. extracts and stores tags set from the given data.
""" """
@ -160,7 +175,7 @@ if __name__ == "__main__":
from tqdm import tqdm from tqdm import tqdm
t = TagIdConverter() t = TagIdConverter()
train, dev, test = readKoreanDataAll() train, dev, test = readEnglishDataAll()
vocab = set() vocab = set()
def getTags(lst: List[Sentence]): def getTags(lst: List[Sentence]):
for s in tqdm(lst): for s in tqdm(lst):
@ -173,7 +188,6 @@ if __name__ == "__main__":
print("get tags from test...") print("get tags from test...")
getTags(test) getTags(test)
print(vocab) print(vocab)
for v in vocab: for v in vocab:
if v == "O": if v == "O":
continue continue
@ -183,7 +197,6 @@ if __name__ == "__main__":
if not v in vocab: if not v in vocab:
print("could not found pair " ,v) print("could not found pair " ,v)
vocab.add(v) vocab.add(v)
tags = [{"name":"[PAD]","index":0}] tags = [{"name":"[PAD]","index":0}]
i = 1 i = 1
vocab_list = [*vocab] vocab_list = [*vocab]
@ -192,5 +205,5 @@ if __name__ == "__main__":
tags.append({"name":v,"index":i}) tags.append({"name":v,"index":i})
i += 1 i += 1
print(tags) print(tags)
with open("tags.json","w",encoding="utf-8") as fp: #with open("tags.json","w",encoding="utf-8") as fp:
json.dump(tags,fp,ensure_ascii=False, indent=2) # json.dump(tags,fp,ensure_ascii=False, indent=2)