feat(read_data): add english data

parent bb1e0b5c64
commit 883f39d645

read_data.py (75 changed lines)
@@ -1,5 +1,6 @@
 import enum
-from typing import NamedTuple, List, Sequence, TypeVar
+from io import TextIOWrapper
+from typing import Iterable, NamedTuple, List, Sequence, TypeVar
 import json
 
 KoreanBase="[Ko, En] NER, POStag data/국문 NER, POS"
@@ -25,48 +26,64 @@ class Sentence(NamedTuple):
         self.namedEntity.append(namedEntity)
         self.detail.append(detail)
 
 T = TypeVar('T')
 
-def readDataList(lst: List[T]):
-    ret = []
+def readDataList(lst: Iterable[str], sep="\t"):
+    ret:List[str] = []
     for l in lst:
-        if len(l) > 0:
-            ret.append(l)
-        else:
+        l = l.strip()
+        if l == "":
             yield ret
             ret.clear()
+        else:
+            ret.append(l.split(sep))
 
-def readKoreanData(path: str) -> List[Sentence]:
-    fp = open(path,encoding="utf-8")
-    data = []
-    for line in fp.readlines():
-        line = line.strip()
-        if line == "":
-            data.append([])
-        else:
-            data.append(line.split("\t"))
-    fp.close()
-    # Do not use csv reader.
-    ret = []
-    for lines in readDataList(data):
+def readKoreanData(fp: TextIOWrapper) -> List[Sentence]:
+    ret = []
+    # NOTE(monoid): Do not use csv reader.
+    for lines in readDataList(fp):
         sentence = Sentence([],[],[],[])
         for line in lines:
             word_pos:str = line[0]
             words = word_pos.split("/")
             sentence.append(words[0],line[1],line[2],line[3])
         ret.append(sentence)
+    fp.close()
     return ret
 
+def readEnglishData(fp: TextIOWrapper) -> List[Sentence]:
+    ret = []
+    for lines in readDataList(fp,sep=" "):
+        if len(lines) == 1 and lines[0][0] == "-DOCSTART-":
+            continue
+        sentence = Sentence([],[],[],[])
+        for line in lines:
+            sentence.append(line[0],line[1],line[2],line[3])
+        ret.append(sentence)
+    return ret
+
 def readKoreanDataAll():
     """
-    @return train, dev, test tuple
     Each entry is structured as follows:
         POS,
+    Return: train, dev, test tuple
     """
-    dev = readKoreanData(f"{KoreanBase}/dev.txt")
-    test = readKoreanData(f"{KoreanBase}/test.txt")
-    train = readKoreanData(f"{KoreanBase}/train.txt")
+    with open(f"{KoreanBase}/dev.txt", encoding="utf-8") as fp:
+        dev = readKoreanData(fp)
+    with open(f"{KoreanBase}/test.txt", encoding="utf-8") as fp:
+        test = readKoreanData(fp)
+    with open(f"{KoreanBase}/train.txt", encoding="utf-8") as fp:
+        train = readKoreanData(fp)
     return train, dev, test
 
+def readEnglishDataAll():
+    with open(f"{EnglishBase}/valid.txt", encoding="utf-8") as fp:
+        print("a")
+        dev = readEnglishData(fp)
+        print("b")
+    with open(f"{EnglishBase}/test.txt", encoding="utf-8") as fp:
+        test = readEnglishData(fp)
+    with open(f"{EnglishBase}/train.txt", encoding="utf-8") as fp:
+        train = readEnglishData(fp)
+    return train, dev, test
+
 class TagIdConverter:
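Editor's note (not part of the commit): the reworked readDataList is now a generator that strips each line, splits non-empty lines on sep, and yields one block of rows per blank-line-delimited record, which is the layout both readers consume. A minimal sketch of its behavior, using a hypothetical two-token sample in the space-separated CoNLL-2003-style English format ("word POS chunk NER", blank line ends the sentence):

    from io import StringIO
    from read_data import readDataList

    # Hypothetical sample in the space-separated English format.
    sample = StringIO(
        "EU NNP B-NP B-ORG\n"
        "rejects VBZ B-VP O\n"
        "\n"
    )
    for block in readDataList(sample, sep=" "):
        # block is a list of row lists, e.g. [['EU', 'NNP', 'B-NP', 'B-ORG'], ...].
        # Caveat: the generator yields its internal list and clears it afterwards,
        # so each block must be consumed before advancing the iterator.
        print([row[0] for row in block])  # -> ['EU', 'rejects']

Two behaviors worth noting: a trailing record is only emitted if the input ends with a blank line (the yield fires on blank lines only), and the ret:List[str] annotation is stale, since after the split the list actually holds rows of type List[List[str]].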
@@ -151,8 +168,6 @@ def make_long_namedEntity(a,b,c):
                 break
     return ret
 
-
-
 """
 extracts and stores tags set from the given data.
 """
@@ -160,7 +175,7 @@ if __name__ == "__main__":
     from tqdm import tqdm
     t = TagIdConverter()
 
-    train, dev, test = readKoreanDataAll()
+    train, dev, test = readEnglishDataAll()
     vocab = set()
     def getTags(lst: List[Sentence]):
         for s in tqdm(lst):
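Editor's note (not part of the commit): a quick end-to-end smoke test for the new English path, assuming EnglishBase points at a standard CoNLL-2003 checkout with valid/test/train splits:

    from read_data import readEnglishDataAll

    # -DOCSTART- blocks are filtered inside readEnglishData, so for stock
    # CoNLL-2003 this prints 14041 3250 3453 (train/dev/test sentence counts).
    train, dev, test = readEnglishDataAll()
    print(len(train), len(dev), len(test))

The print("a")/print("b") calls inside readEnglishDataAll appear to be leftover debug output and will fire on every load.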
@@ -173,7 +188,6 @@ if __name__ == "__main__":
     print("get tags from test...")
     getTags(test)
     print(vocab)
-
     for v in vocab:
         if v == "O":
             continue
@@ -183,7 +197,6 @@ if __name__ == "__main__":
         if not v in vocab:
             print("could not found pair " ,v)
             vocab.add(v)
-
     tags = [{"name":"[PAD]","index":0}]
     i = 1
     vocab_list = [*vocab]
@@ -192,5 +205,5 @@ if __name__ == "__main__":
         tags.append({"name":v,"index":i})
         i += 1
     print(tags)
-    with open("tags.json","w",encoding="utf-8") as fp:
-        json.dump(tags,fp,ensure_ascii=False, indent=2)
+    #with open("tags.json","w",encoding="utf-8") as fp:
+    #    json.dump(tags,fp,ensure_ascii=False, indent=2)
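Editor's note (not part of the commit): with the final two lines uncommented, the script writes the collected tag vocabulary to tags.json. The serialized shape, with hypothetical B-/I- names since the actual tag set depends on the data:

    # What json.dump(tags, fp, ensure_ascii=False, indent=2) would emit,
    # expressed as the Python list being serialized (tag names hypothetical):
    tags = [
        {"name": "[PAD]", "index": 0},  # padding always occupies index 0
        {"name": "B-ORG", "index": 1},
        {"name": "I-ORG", "index": 2},
        # ... one entry per tag collected from train/dev/test
    ]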