Compare commits


No commits in common. "46f8d08fd7442e5655c3aaa5fc5d9a319d5a3518" and "92205ea7952fe2bf849e1ca5143fddfc9e0da6ab" have entirely different histories.

10 changed files with 144 additions and 1826815 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -5,6 +5,8 @@ from read_data import TagIdConverter
 from preprocessing import readPreporcssedDataAll
 from transformers import PreTrainedTokenizer
+tagIdConverter = TagIdConverter()
 class DatasetArray(Dataset):
     def __init__(self, data):
         self.x = data
@@ -37,7 +39,7 @@ def wrap_sentence(tokenizer: PreTrainedTokenizer, sentence):
 def wrap_entities(tagIdConverter: TagIdConverter, entities):
     return [tagIdConverter.O_id] + entities + [tagIdConverter.O_id]
-def make_collate_fn(tokenizer: PreTrainedTokenizer, tagIdConverter: TagIdConverter):
+def make_collate_fn(tokenizer: PreTrainedTokenizer):
     def ret_fn(batch):
         words = [wrap_sentence(tokenizer,item["ids"]) for item in batch]
         entities = [wrap_entities(tagIdConverter,item["entity_ids"]) for item in batch]
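The hunk above only changes the signature of make_collate_fn: the converter moves from an explicit argument to the module-level tagIdConverter added in the first hunk. Below is a minimal sketch (not part of the diff) of the resulting call-site difference; the module name `dataset` and the batch size are assumptions, everything else refers to symbols shown in this compare view.

# Sketch only: wiring each variant of make_collate_fn into a torch DataLoader.
from torch.utils.data import DataLoader
from read_data import TagIdConverter
from dataset import DatasetArray, make_collate_fn  # module name assumed

def build_loader(data, tokenizer, old_api: bool) -> DataLoader:
    if old_api:
        # base commit 46f8d08: the TagIdConverter is passed explicitly
        collate = make_collate_fn(tokenizer, TagIdConverter())
    else:
        # head commit 92205ea: the module-level tagIdConverter is captured implicitly
        collate = make_collate_fn(tokenizer)
    return DataLoader(DatasetArray(data), batch_size=16, collate_fn=collate)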


@@ -1,42 +0,0 @@
-[
-  {
-    "name": "[PAD]",
-    "index": 0
-  },
-  {
-    "name": "B-LOC",
-    "index": 1
-  },
-  {
-    "name": "B-MISC",
-    "index": 2
-  },
-  {
-    "name": "B-ORG",
-    "index": 3
-  },
-  {
-    "name": "B-PER",
-    "index": 4
-  },
-  {
-    "name": "I-LOC",
-    "index": 5
-  },
-  {
-    "name": "I-MISC",
-    "index": 6
-  },
-  {
-    "name": "I-ORG",
-    "index": 7
-  },
-  {
-    "name": "I-PER",
-    "index": 8
-  },
-  {
-    "name": "O",
-    "index": 9
-  }
-]
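The file removed here is a tag-to-index list (the same structure the tag-extraction script later in this diff writes out as tags.json). A minimal, self-contained sketch of how such a file can be turned into lookup tables; this is an assumption for illustration, not the project's TagIdConverter.

import json

# Load the tag list and build both directions of the mapping.
with open("tags.json", encoding="utf-8") as fp:
    tags = json.load(fp)

token_to_id = {t["name"]: t["index"] for t in tags}   # e.g. {"[PAD]": 0, ..., "O": 9}
id_to_token = {t["index"]: t["name"] for t in tags}

assert token_to_id["[PAD]"] == 0
assert id_to_token[9] == "O"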

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -29,7 +29,6 @@ if __name__ == "__main__":
     test_list = [*range(20,-10,-1)]
     for g in groupby_index(test_list,4):
         print([*g])
-    print("===")
     print([*map(lambda x:[*x],groupby_index([1,2,3,4],2))])
     for g in groupby_index([1,2,3,4],2):
         print([*g])
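groupby_index itself is not shown in this compare view. Based on the test code above (chunks of 4 over range(20,-10,-1), chunks of 2 over [1,2,3,4]), it behaves like a fixed-size chunking generator. The sketch below is an assumption about that behaviour, not the project's implementation.

from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")

def groupby_index_sketch(items: Iterable[T], n: int) -> Iterator[List[T]]:
    """Yield consecutive chunks of size n; the final chunk may be shorter."""
    it = iter(items)
    while True:
        chunk = list(islice(it, n))
        if not chunk:
            return
        yield chunk

print([*map(list, groupby_index_sketch([1, 2, 3, 4], 2))])  # [[1, 2], [3, 4]]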


@@ -1,17 +1,16 @@
-import argparse
-import os
-import sys
-from read_data import TagIdConverter, make_long_namedEntity, readEnglishDataAll, readKoreanDataAll, Sentence
-from typing import Any, List
+from read_data import TagIdConverter, make_long_namedEntity, readKoreanDataAll, Sentence
+from typing import Any, NamedTuple, List, Sequence, TypeVar
 import json
+import os.path as path
 import tqdm
 from transformers import PreTrainedTokenizer
-PRE_BASE_PATH = 'prepro'
+PREPROCESSING_BASE_PATH = 'prepro'
+converter = TagIdConverter()
-def preprocessing(tokenizer : PreTrainedTokenizer, converter :TagIdConverter,dataset: List[Sentence]):
+def preprocessing(tokenizer : PreTrainedTokenizer,dataset: List[Sentence]):
     ret = []
     for item in tqdm.tqdm(dataset):
         assert len(item.word) == len(item.detail)
@@ -36,44 +35,26 @@ def readPreprocessedData(path: str):
     with open(path,"r", encoding="utf-8") as fp:
         return json.load(fp)
-def readPreporcssedDataAll(path = PRE_BASE_PATH):
-    train = readPreprocessedData(os.path.join(path,"train.json"))
-    dev = readPreprocessedData(os.path.join(path,"dev.json"))
-    test = readPreprocessedData(os.path.join(path,"test.json"))
+def readPreporcssedDataAll():
+    train = readPreprocessedData(path.join(PREPROCESSING_BASE_PATH,"train.json"))
+    dev = readPreprocessedData(path.join(PREPROCESSING_BASE_PATH,"dev.json"))
+    test = readPreprocessedData(path.join(PREPROCESSING_BASE_PATH,"test.json"))
     return train, dev, test
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--kind", default="korean")
-    parser.add_argument("path",default=PRE_BASE_PATH,help="directory path of processed data")
-    parser.add_argument("--tag", default="tags.json",help="path of tag description")
-    args = parser.parse_args()
-    dirPath = args.path
-    if args.kind == "korean":
-        rawTrain, rawDev, rawTest = readKoreanDataAll()
-    elif args.kind == "english":
-        rawTrain, rawDev, rawTest = readEnglishDataAll()
-    else:
-        print("unknown language",file=sys.stderr)
-        exit(1)
-    converter = TagIdConverter(args.tag)
-    os.makedirs(dirPath)
     from transformers import BertTokenizer
     PRETAINED_MODEL_NAME = 'bert-base-multilingual-cased'
-    print("load tokenzier...",file=sys.stderr)
+    rawTrain, rawDev, rawTest = readKoreanDataAll()
+    print("load tokenzier...")
     tokenizer = BertTokenizer.from_pretrained(PRETAINED_MODEL_NAME)
-    print("process train...",file=sys.stderr)
-    train = preprocessing(tokenizer,converter,rawTrain)
-    saveObject(path.join(dirPath,"train.json"),train)
-    print("process dev...",file=sys.stderr)
-    dev = preprocessing(tokenizer,converter,rawDev)
-    saveObject(path.join(dirPath,"dev.json"),dev)
-    print("process test...",file=sys.stderr)
-    test = preprocessing(tokenizer,converter,rawTest)
-    saveObject(path.join(dirPath,"test.json"),test)
+    print("process train...")
+    train = preprocessing(tokenizer,rawTrain)
+    saveObject(path.join(PREPROCESSING_BASE_PATH,"train.json"),train)
+    print("process dev...")
+    dev = preprocessing(tokenizer,rawDev)
+    saveObject(path.join(PREPROCESSING_BASE_PATH,"dev.json"),dev)
+    print("process test...")
+    test = preprocessing(tokenizer,rawTest)
+    saveObject(path.join(PREPROCESSING_BASE_PATH,"test.json"),test)


@@ -1,9 +1,6 @@
 import enum
-from io import TextIOWrapper
-import sys
-from typing import Iterable, NamedTuple, List, Sequence, TypeVar
+from typing import NamedTuple, List, Sequence, TypeVar
 import json
-import argparse
 KoreanBase="[Ko, En] NER, POStag data/국문 NER, POS"
 EnglishBase="[Ko, En] NER, POStag data/영문 NER, POS"
@@ -28,20 +25,29 @@ class Sentence(NamedTuple):
         self.namedEntity.append(namedEntity)
         self.detail.append(detail)
 T = TypeVar('T')
-def readDataList(lst: Iterable[str], sep="\t"):
-    ret:List[str] = []
+def readDataList(lst: List[T]):
+    ret = []
     for l in lst:
-        l = l.strip()
-        if l == "":
+        if len(l) > 0:
+            ret.append(l)
+        else:
             yield ret
             ret.clear()
-        else:
-            ret.append(l.split(sep))
-def readKoreanData(fp: TextIOWrapper) -> List[Sentence]:
+def readKoreanData(path: str) -> List[Sentence]:
+    fp = open(path,encoding="utf-8")
+    data = []
+    for line in fp.readlines():
+        line = line.strip()
+        if line == "":
+            data.append([])
+        else:
+            data.append(line.split("\t"))
+    fp.close()
+    # Do not use csv reader.
     ret = []
-    # NOTE(monoid): Do not use csv reader.
-    for lines in readDataList(fp):
+    for lines in readDataList(data):
         sentence = Sentence([],[],[],[])
         for line in lines:
             word_pos:str = line[0]
@@ -49,41 +55,18 @@ def readKoreanData(fp: TextIOWrapper) -> List[Sentence]:
             sentence.append(words[0],line[1],line[2],line[3])
         ret.append(sentence)
-    fp.close()
     return ret
-def readEnglishData(fp: TextIOWrapper) -> List[Sentence]:
-    ret = []
-    for lines in readDataList(fp,sep=" "):
-        if len(lines) == 1 and lines[0][0] == "-DOCSTART-":
-            continue
-        sentence = Sentence([],[],[],[])
-        for line in lines:
-            sentence.append(line[0],line[1],line[2],line[3])
-        ret.append(sentence)
-    return ret
 def readKoreanDataAll():
     """
-    @return train, dev, test tuple
     Each entry is structured as follows:
     POS,
+    Return: train, dev, test tuple
     """
-    with open(f"{KoreanBase}/dev.txt", encoding="utf-8") as fp:
-        dev = readKoreanData(fp)
-    with open(f"{KoreanBase}/test.txt", encoding="utf-8") as fp:
-        test = readKoreanData(fp)
-    with open(f"{KoreanBase}/train.txt", encoding="utf-8") as fp:
-        train = readKoreanData(fp)
-    return train, dev, test
-def readEnglishDataAll():
-    with open(f"{EnglishBase}/valid.txt", encoding="utf-8") as fp:
-        dev = readEnglishData(fp)
-    with open(f"{EnglishBase}/test.txt", encoding="utf-8") as fp:
-        test = readEnglishData(fp)
-    with open(f"{EnglishBase}/train.txt", encoding="utf-8") as fp:
-        train = readEnglishData(fp)
+    dev = readKoreanData(f"{KoreanBase}/dev.txt")
+    test = readKoreanData(f"{KoreanBase}/test.txt")
+    train = readKoreanData(f"{KoreanBase}/train.txt")
     return train, dev, test
 class TagIdConverter:
@@ -103,12 +86,6 @@
     @property
     def pad_id(self):
         return self.vocab["[PAD]"]
-    @property
-    def size(self):
-        return len(self.vocab)
-    def __len__(self):
-        return self.size
     def convert_ids_to_tokens(self,ids: List[int]):
         return [self.ids_to_token[id] for id in ids]
@@ -174,37 +151,29 @@ def make_long_namedEntity(a,b,c):
             break
     return ret
 """
 extracts and stores tags set from the given data.
 """
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="create tags list")
-    parser.add_argument("--kind","-k",default='korean', help='kind of language: korean or english')
-    parser.add_argument("--stdout",action='store_true',help='print tags data to stdout')
-    parser.add_argument("--path",default="tags.json", help="path of tags data")
-    args = parser.parse_args()
     from tqdm import tqdm
-    if args.kind == "korean" or args.kind == "ko" or args.kind == "kor":
-        train, dev, test = readEnglishDataAll()
-    elif args.kind == "english" or args.kind == "en" or args.kind =="eng":
-        train, dev, test = readKoreanDataAll()
-    else:
-        print("unknown language",file=sys.stderr)
-        exit(1)
+    t = TagIdConverter()
+    train, dev, test = readKoreanDataAll()
     vocab = set()
     def getTags(lst: List[Sentence]):
         for s in tqdm(lst):
             for e in s.detail:
                 vocab.add(e)
-    print("get tags from train...",file=sys.stderr)
+    print("get tags from train...")
     getTags(train)
-    print("get tags from dev...",file=sys.stderr)
+    print("get tags from dev...")
     getTags(dev)
-    print("get tags from test...",file=sys.stderr)
+    print("get tags from test...")
     getTags(test)
-    print(vocab,file=sys.stderr)
+    print(vocab)
     for v in vocab:
         if v == "O":
             continue
@@ -214,6 +183,7 @@ if __name__ == "__main__":
         if not v in vocab:
             print("could not found pair " ,v)
             vocab.add(v)
+    tags = [{"name":"[PAD]","index":0}]
     i = 1
     vocab_list = [*vocab]
@@ -221,10 +191,6 @@
     for v in vocab_list:
         tags.append({"name":v,"index":i})
         i += 1
-    print(tags,file=sys.stderr)
-    if args.stdout:
-        json.dump(tags,sys.stdout,ensure_ascii=False, indent=2)
-    else:
-        p = args.path
-        with open(p,"w",encoding="utf-8") as fp:
-            json.dump(tags,fp,ensure_ascii=False, indent=2)
+    print(tags)
+    with open("tags.json","w",encoding="utf-8") as fp:
+        json.dump(tags,fp,ensure_ascii=False, indent=2)
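As reconstructed above, the head-commit readDataList consumes a pre-split list of rows and yields one sentence group per empty row, reusing the same list object between groups. A small self-contained check of that behaviour; the toy rows and function name are chosen for illustration only.

def read_data_list_sketch(rows):
    # Mirrors the head-commit readDataList: accumulate non-empty rows,
    # flush the current group whenever an empty row is seen.
    ret = []
    for row in rows:
        if len(row) > 0:
            ret.append(row)
        else:
            yield ret
            ret.clear()

toy = [["w1", "TAG1"], ["w2", "TAG2"], [], ["w3", "TAG3"], []]
for group in read_data_list_sketch(toy):
    # Consume each group before the next iteration: ret.clear() reuses the list.
    print(list(group))
# Output:
# [['w1', 'TAG1'], ['w2', 'TAG2']]
# [['w3', 'TAG3']]
# Note: a trailing group not followed by an empty row is never yielded.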