feat(read_data): add english data
This commit is contained in:
parent
bb1e0b5c64
commit
883f39d645
73
read_data.py
73
read_data.py
@ -1,5 +1,6 @@
|
||||
import enum
|
||||
from typing import NamedTuple, List, Sequence, TypeVar
|
||||
from io import TextIOWrapper
|
||||
from typing import Iterable, NamedTuple, List, Sequence, TypeVar
|
||||
import json
|
||||
|
||||
KoreanBase="[Ko, En] NER, POStag data/국문 NER, POS"
|
||||
@ -25,29 +26,20 @@ class Sentence(NamedTuple):
|
||||
self.namedEntity.append(namedEntity)
|
||||
self.detail.append(detail)
|
||||
T = TypeVar('T')
|
||||
def readDataList(lst: List[T]):
|
||||
ret = []
|
||||
def readDataList(lst: Iterable[str], sep="\t"):
|
||||
ret:List[str] = []
|
||||
for l in lst:
|
||||
if len(l) > 0:
|
||||
ret.append(l)
|
||||
else:
|
||||
l = l.strip()
|
||||
if l == "":
|
||||
yield ret
|
||||
ret.clear()
|
||||
|
||||
def readKoreanData(path: str) -> List[Sentence]:
|
||||
fp = open(path,encoding="utf-8")
|
||||
data = []
|
||||
for line in fp.readlines():
|
||||
line = line.strip()
|
||||
if line == "":
|
||||
data.append([])
|
||||
else:
|
||||
data.append(line.split("\t"))
|
||||
fp.close()
|
||||
# Do not use csv reader.
|
||||
ret = []
|
||||
ret.append(l.split(sep))
|
||||
|
||||
for lines in readDataList(data):
|
||||
def readKoreanData(fp: TextIOWrapper) -> List[Sentence]:
|
||||
ret = []
|
||||
# NOTE(monoid): Do not use csv reader.
|
||||
for lines in readDataList(fp):
|
||||
sentence = Sentence([],[],[],[])
|
||||
for line in lines:
|
||||
word_pos:str = line[0]
|
||||
@ -55,18 +47,43 @@ def readKoreanData(path: str) -> List[Sentence]:
|
||||
sentence.append(words[0],line[1],line[2],line[3])
|
||||
ret.append(sentence)
|
||||
|
||||
fp.close()
|
||||
return ret
|
||||
|
||||
def readEnglishData(fp: TextIOWrapper) -> List[Sentence]:
    """Parse space-separated English data from ``fp`` into sentences.

    ``readDataList`` is asked to split every row on a single space; each
    group of rows it yields becomes one ``Sentence``. A group made of exactly
    one row whose first field is ``-DOCSTART-`` is skipped.

    Returns the sentences in the order their groups were read.
    """
    sentences = []
    for rows in readDataList(fp, sep=" "):
        is_doc_marker = len(rows) == 1 and rows[0][0] == "-DOCSTART-"
        if is_doc_marker:
            continue
        current = Sentence([], [], [], [])
        for row in rows:
            # Only the first four columns of a row are consumed.
            current.append(row[0], row[1], row[2], row[3])
        sentences.append(current)
    return sentences
|
||||
|
||||
def readKoreanDataAll():
    """Load the Korean dev, test and train splits.

    Opens ``dev.txt``, ``test.txt`` and ``train.txt`` under ``KoreanBase``
    as UTF-8 text, in that order, and parses each with ``readKoreanData``.

    Return: train, dev, test tuple
    """
    def _load(name):
        # readKoreanData takes an open text file, not a path, so the file is
        # opened here and closed again as soon as parsing finishes.
        with open(f"{KoreanBase}/{name}", encoding="utf-8") as fp:
            return readKoreanData(fp)

    dev = _load("dev.txt")
    test = _load("test.txt")
    train = _load("train.txt")
    return train, dev, test
|
||||
|
||||
def readEnglishDataAll():
    """Load the English validation, test and train splits.

    Opens ``valid.txt``, ``test.txt`` and ``train.txt`` under ``EnglishBase``
    as UTF-8 text, in that order, and parses each with ``readEnglishData``.

    Return: train, dev, test tuple (``dev`` is read from ``valid.txt``)
    """
    def _load(name):
        # Keep each file open only for as long as it is being parsed.
        with open(f"{EnglishBase}/{name}", encoding="utf-8") as fp:
            return readEnglishData(fp)

    dev = _load("valid.txt")
    test = _load("test.txt")
    train = _load("train.txt")
    return train, dev, test
|
||||
|
||||
class TagIdConverter:
|
||||
@ -151,8 +168,6 @@ def make_long_namedEntity(a,b,c):
|
||||
break
|
||||
return ret
|
||||
|
||||
|
||||
|
||||
"""
|
||||
extracts and stores tags set from the given data.
|
||||
"""
|
||||
@ -160,7 +175,7 @@ if __name__ == "__main__":
|
||||
from tqdm import tqdm
|
||||
t = TagIdConverter()
|
||||
|
||||
train, dev, test = readKoreanDataAll()
|
||||
train, dev, test = readEnglishDataAll()
|
||||
vocab = set()
|
||||
def getTags(lst: List[Sentence]):
|
||||
for s in tqdm(lst):
|
||||
@ -173,7 +188,6 @@ if __name__ == "__main__":
|
||||
print("get tags from test...")
|
||||
getTags(test)
|
||||
print(vocab)
|
||||
|
||||
for v in vocab:
|
||||
if v == "O":
|
||||
continue
|
||||
@ -183,7 +197,6 @@ if __name__ == "__main__":
|
||||
if not v in vocab:
|
||||
print("could not found pair " ,v)
|
||||
vocab.add(v)
|
||||
|
||||
tags = [{"name":"[PAD]","index":0}]
|
||||
i = 1
|
||||
vocab_list = [*vocab]
|
||||
@ -192,5 +205,5 @@ if __name__ == "__main__":
|
||||
tags.append({"name":v,"index":i})
|
||||
i += 1
|
||||
print(tags)
|
||||
with open("tags.json","w",encoding="utf-8") as fp:
|
||||
json.dump(tags,fp,ensure_ascii=False, indent=2)
|
||||
#with open("tags.json","w",encoding="utf-8") as fp:
|
||||
# json.dump(tags,fp,ensure_ascii=False, indent=2)
|
Loading…
Reference in New Issue
Block a user