Compare commits
No commits in common. "46f8d08fd7442e5655c3aaa5fc5d9a319d5a3518" and "92205ea7952fe2bf849e1ca5143fddfc9e0da6ab" have entirely different histories.
46f8d08fd7 ... 92205ea795
EngTraning.ipynb (1115 changed lines)
File diff suppressed because one or more lines are too long
Training.ipynb (180 changed lines)
File diff suppressed because one or more lines are too long
@@ -5,6 +5,8 @@ from read_data import TagIdConverter
 from preprocessing import readPreporcssedDataAll
 from transformers import PreTrainedTokenizer
 
+tagIdConverter = TagIdConverter()
+
 class DatasetArray(Dataset):
     def __init__(self, data):
         self.x = data
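The hunk only shows the first lines of DatasetArray; a torch Dataset also needs __len__ and __getitem__, which sit outside the hunk. A minimal sketch of such a list-backed dataset follows, assuming Dataset comes from torch.utils.data (the import is not visible here) and that the stored items are the preprocessed dicts used by the collate function further down; the class name is hypothetical.

# Minimal sketch of a list-backed Dataset in the style of DatasetArray above.
# __len__/__getitem__ are assumed, since the diff only shows __init__.
from torch.utils.data import Dataset

class DatasetArraySketch(Dataset):
    def __init__(self, data):
        self.x = data            # list of preprocessed items, e.g. dicts with "ids"/"entity_ids"

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx]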
@@ -37,7 +39,7 @@ def wrap_sentence(tokenizer: PreTrainedTokenizer, sentence):
 def wrap_entities(tagIdConverter: TagIdConverter, entities):
     return [tagIdConverter.O_id] + entities + [tagIdConverter.O_id]
 
-def make_collate_fn(tokenizer: PreTrainedTokenizer, tagIdConverter: TagIdConverter):
+def make_collate_fn(tokenizer: PreTrainedTokenizer):
     def ret_fn(batch):
         words = [wrap_sentence(tokenizer,item["ids"]) for item in batch]
         entities = [wrap_entities(tagIdConverter,item["entity_ids"]) for item in batch]
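ret_fn's remaining lines (padding and tensor conversion) are outside the hunk. The sketch below is one plausible completion, not the repository's code: it assumes wrap_sentence adds the tokenizer's [CLS]/[SEP] ids around each sequence (mirroring the O_id wrapping shown for entities), pads to the batch maximum, and returns tensors. pad_id is the TagIdConverter property shown later in this diff; pad_to_max and make_collate_fn_sketch are illustrative names.

# Hedged sketch of how a collate function like ret_fn is commonly completed.
import torch

def pad_to_max(seqs, pad_id):
    max_len = max(len(s) for s in seqs)
    return [s + [pad_id] * (max_len - len(s)) for s in seqs]

def make_collate_fn_sketch(tokenizer, tagIdConverter):
    def ret_fn(batch):
        # assumed equivalents of wrap_sentence / wrap_entities above
        words = [[tokenizer.cls_token_id] + item["ids"] + [tokenizer.sep_token_id] for item in batch]
        entities = [[tagIdConverter.O_id] + item["entity_ids"] + [tagIdConverter.O_id] for item in batch]
        words = torch.tensor(pad_to_max(words, tokenizer.pad_token_id))
        entities = torch.tensor(pad_to_max(entities, tagIdConverter.pad_id))
        return words, entities
    return ret_fn

# usage sketch: DataLoader(dataset, batch_size=32, collate_fn=make_collate_fn_sketch(tok, conv))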
@@ -1,42 +0,0 @@
-[
-  {
-    "name": "[PAD]",
-    "index": 0
-  },
-  {
-    "name": "B-LOC",
-    "index": 1
-  },
-  {
-    "name": "B-MISC",
-    "index": 2
-  },
-  {
-    "name": "B-ORG",
-    "index": 3
-  },
-  {
-    "name": "B-PER",
-    "index": 4
-  },
-  {
-    "name": "I-LOC",
-    "index": 5
-  },
-  {
-    "name": "I-MISC",
-    "index": 6
-  },
-  {
-    "name": "I-ORG",
-    "index": 7
-  },
-  {
-    "name": "I-PER",
-    "index": 8
-  },
-  {
-    "name": "O",
-    "index": 9
-  }
-]
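The deleted file is a JSON list of tag names and indices: [PAD], the B-/I- BIO tags for LOC, MISC, ORG and PER, and O. A hedged sketch of turning such a list into name/id lookups follows; TagIdConverter's actual internals are not shown in this diff, so the function below is illustrative only.

import json

# Illustrative: build name<->id maps from a tag list shaped like the deleted JSON above.
def load_tag_maps(tag_path="tags.json"):
    with open(tag_path, encoding="utf-8") as fp:
        tags = json.load(fp)
    token_to_id = {t["name"]: t["index"] for t in tags}
    id_to_token = {t["index"]: t["name"] for t in tags}
    return token_to_id, id_to_token

# token_to_id, id_to_token = load_tag_maps()
# token_to_id["O"] -> 9 and id_to_token[0] -> "[PAD]" with the list shown above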
engpre/dev.json (306118 changed lines)
File diff suppressed because it is too large
engpre/test.json (287552 changed lines)
File diff suppressed because it is too large
engpre/train.json (1231776 changed lines)
File diff suppressed because it is too large
@@ -29,7 +29,6 @@ if __name__ == "__main__":
     test_list = [*range(20,-10,-1)]
     for g in groupby_index(test_list,4):
         print([*g])
-    print("===")
     print([*map(lambda x:[*x],groupby_index([1,2,3,4],2))])
     for g in groupby_index([1,2,3,4],2):
         print([*g])
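This __main__ block exercises groupby_index, which, judging from the calls above, chunks an iterable into groups of n. Its implementation is not part of this hunk, so the following is a hedged sketch consistent with that usage, under an illustrative name.

from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")

# Hedged sketch of a chunking helper consistent with groupby_index(test_list, 4) above;
# not the repository's actual code.
def groupby_index_sketch(iterable: Iterable[T], n: int) -> Iterator[List[T]]:
    it = iter(iterable)
    while True:
        chunk = list(islice(it, n))
        if not chunk:
            return
        yield chunk

print([list(g) for g in groupby_index_sketch([1, 2, 3, 4], 2)])   # [[1, 2], [3, 4]]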
@@ -1,17 +1,16 @@
 
 
-import argparse
-import os
-import sys
-from read_data import TagIdConverter, make_long_namedEntity, readEnglishDataAll, readKoreanDataAll, Sentence
-from typing import Any, List
+from read_data import TagIdConverter, make_long_namedEntity, readKoreanDataAll, Sentence
+from typing import Any, NamedTuple, List, Sequence, TypeVar
 import json
+import os.path as path
 import tqdm
 from transformers import PreTrainedTokenizer
 
-PRE_BASE_PATH = 'prepro'
+PREPROCESSING_BASE_PATH = 'prepro'
+converter = TagIdConverter()
 
-def preprocessing(tokenizer : PreTrainedTokenizer, converter :TagIdConverter,dataset: List[Sentence]):
+def preprocessing(tokenizer : PreTrainedTokenizer,dataset: List[Sentence]):
     ret = []
     for item in tqdm.tqdm(dataset):
         assert len(item.word) == len(item.detail)
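preprocessing() walks each Sentence with tqdm and asserts that the word and tag-detail lists stay aligned; the rest of the loop body is outside the hunk. Based on the "ids"/"entity_ids" keys read by the collate function earlier, each output record presumably holds token ids plus aligned tag ids. The sketch below illustrates one plausible loop body under that assumption: the repository imports make_long_namedEntity, which likely handles the subword/tag alignment, so repeating the tag per subword here is a simplification, and tag_to_id is a plain-dict stand-in for whatever lookup TagIdConverter provides.

# Hedged sketch of what one preprocessed record might look like; not the repository's loop.
def preprocess_item_sketch(tokenizer, tag_to_id, sentence):
    ids, entity_ids = [], []
    for word, tag in zip(sentence.word, sentence.detail):
        word_ids = tokenizer.encode(word, add_special_tokens=False)
        ids.extend(word_ids)
        entity_ids.extend([tag_to_id[tag]] * len(word_ids))   # repeat the tag id over subwords
    return {"ids": ids, "entity_ids": entity_ids}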
@@ -36,44 +35,26 @@ def readPreprocessedData(path: str):
     with open(path,"r", encoding="utf-8") as fp:
         return json.load(fp)
 
-def readPreporcssedDataAll(path = PRE_BASE_PATH):
-    train = readPreprocessedData(os.path.join(path,"train.json"))
-    dev = readPreprocessedData(os.path.join(path,"dev.json"))
-    test = readPreprocessedData(os.path.join(path,"test.json"))
+def readPreporcssedDataAll():
+    train = readPreprocessedData(path.join(PREPROCESSING_BASE_PATH,"train.json"))
+    dev = readPreprocessedData(path.join(PREPROCESSING_BASE_PATH,"dev.json"))
+    test = readPreprocessedData(path.join(PREPROCESSING_BASE_PATH,"test.json"))
     return train, dev, test
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--kind", default="korean")
-    parser.add_argument("path",default=PRE_BASE_PATH,help="directory path of processed data")
-    parser.add_argument("--tag", default="tags.json",help="path of tag description")
-
-    args = parser.parse_args()
-    dirPath = args.path
-
-    if args.kind == "korean":
-        rawTrain, rawDev, rawTest = readKoreanDataAll()
-    elif args.kind == "english":
-        rawTrain, rawDev, rawTest = readEnglishDataAll()
-    else:
-        print("unknown language",file=sys.stderr)
-        exit(1)
-
-    converter = TagIdConverter(args.tag)
-    os.makedirs(dirPath)
-
     from transformers import BertTokenizer
     PRETAINED_MODEL_NAME = 'bert-base-multilingual-cased'
 
-    print("load tokenzier...",file=sys.stderr)
+    rawTrain, rawDev, rawTest = readKoreanDataAll()
+    print("load tokenzier...")
     tokenizer = BertTokenizer.from_pretrained(PRETAINED_MODEL_NAME)
 
-    print("process train...",file=sys.stderr)
-    train = preprocessing(tokenizer,converter,rawTrain)
-    saveObject(path.join(dirPath,"train.json"),train)
-    print("process dev...",file=sys.stderr)
-    dev = preprocessing(tokenizer,converter,rawDev)
-    saveObject(path.join(dirPath,"dev.json"),dev)
-    print("process test...",file=sys.stderr)
-    test = preprocessing(tokenizer,converter,rawTest)
-    saveObject(path.join(dirPath,"test.json"),test)
+    print("process train...")
+    train = preprocessing(tokenizer,rawTrain)
+    saveObject(path.join(PREPROCESSING_BASE_PATH,"train.json"),train)
+    print("process dev...")
+    dev = preprocessing(tokenizer,rawDev)
+    saveObject(path.join(PREPROCESSING_BASE_PATH,"dev.json"),dev)
+    print("process test...")
+    test = preprocessing(tokenizer,rawTest)
+    saveObject(path.join(PREPROCESSING_BASE_PATH,"test.json"),test)
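Both sides of the script call saveObject(...), which is not shown in this diff. Since readPreprocessedData above reads the files back with json.load, a json.dump writer is the natural counterpart; the sketch below is that assumption, under an illustrative name, with ensure_ascii=False mirroring the json.dump calls elsewhere in this diff.

import json

# Hedged sketch of the unshown saveObject helper.
def saveObject_sketch(file_path, obj):
    with open(file_path, "w", encoding="utf-8") as fp:
        json.dump(obj, fp, ensure_ascii=False)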
read_data.py (108 changed lines)
@@ -1,9 +1,6 @@
 import enum
-from io import TextIOWrapper
-import sys
-from typing import Iterable, NamedTuple, List, Sequence, TypeVar
+from typing import NamedTuple, List, Sequence, TypeVar
 import json
-import argparse
 
 KoreanBase="[Ko, En] NER, POStag data/국문 NER, POS"
 EnglishBase="[Ko, En] NER, POStag data/영문 NER, POS"
@@ -28,20 +25,29 @@ class Sentence(NamedTuple):
         self.namedEntity.append(namedEntity)
         self.detail.append(detail)
 T = TypeVar('T')
-def readDataList(lst: Iterable[str], sep="\t"):
-    ret:List[str] = []
+def readDataList(lst: List[T]):
+    ret = []
     for l in lst:
-        l = l.strip()
-        if l == "":
+        if len(l) > 0:
+            ret.append(l)
+        else:
             yield ret
             ret.clear()
-        else:
-            ret.append(l.split(sep))
 
-def readKoreanData(fp: TextIOWrapper) -> List[Sentence]:
+def readKoreanData(path: str) -> List[Sentence]:
+    fp = open(path,encoding="utf-8")
+    data = []
+    for line in fp.readlines():
+        line = line.strip()
+        if line == "":
+            data.append([])
+        else:
+            data.append(line.split("\t"))
+    fp.close()
+    # Do not use csv reader.
     ret = []
-    # NOTE(monoid): Do not use csv reader.
-    for lines in readDataList(fp):
+    for lines in readDataList(data):
         sentence = Sentence([],[],[],[])
         for line in lines:
             word_pos:str = line[0]
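Both versions of readDataList group consecutive records into sentence blocks, cutting at blank lines (old side) or at empty records (new side). The tiny run below makes that grouping concrete; the rows are placeholders, not real data. Note that the original yields the same list object and then clears it, so callers must materialize each group before advancing, which the code above does with [*g]-style copies.

# Illustrative grouping in the style of the newer readDataList.
def group_records(records):
    ret = []
    for r in records:
        if len(r) > 0:
            ret.append(r)
        else:
            yield list(ret)   # copy here; the original yields ret itself and clears it
            ret.clear()

rows = [["w1", "POS1", "B", "O"], ["w2", "POS2", "B", "B-LOC"], [], ["w3", "POS3", "B", "O"], []]
print([*group_records(rows)])
# [[['w1', 'POS1', 'B', 'O'], ['w2', 'POS2', 'B', 'B-LOC']], [['w3', 'POS3', 'B', 'O']]]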
@@ -49,41 +55,18 @@ def readKoreanData(fp: TextIOWrapper) -> List[Sentence]:
             sentence.append(words[0],line[1],line[2],line[3])
         ret.append(sentence)
 
-    fp.close()
     return ret
 
-def readEnglishData(fp: TextIOWrapper) -> List[Sentence]:
-    ret = []
-    for lines in readDataList(fp,sep=" "):
-        if len(lines) == 1 and lines[0][0] == "-DOCSTART-":
-            continue
-        sentence = Sentence([],[],[],[])
-        for line in lines:
-            sentence.append(line[0],line[1],line[2],line[3])
-        ret.append(sentence)
-    return ret
-
 def readKoreanDataAll():
     """
+    @return train, dev, test tuple
     Each entry is structured as follows:
     POS,
-    Return: train, dev, test tuple
     """
-    with open(f"{KoreanBase}/dev.txt", encoding="utf-8") as fp:
-        dev = readKoreanData(fp)
-    with open(f"{KoreanBase}/test.txt", encoding="utf-8") as fp:
-        test = readKoreanData(fp)
-    with open(f"{KoreanBase}/train.txt", encoding="utf-8") as fp:
-        train = readKoreanData(fp)
-    return train, dev, test
-
-def readEnglishDataAll():
-    with open(f"{EnglishBase}/valid.txt", encoding="utf-8") as fp:
-        dev = readEnglishData(fp)
-    with open(f"{EnglishBase}/test.txt", encoding="utf-8") as fp:
-        test = readEnglishData(fp)
-    with open(f"{EnglishBase}/train.txt", encoding="utf-8") as fp:
-        train = readEnglishData(fp)
+    dev = readKoreanData(f"{KoreanBase}/dev.txt")
+    test = readKoreanData(f"{KoreanBase}/test.txt")
+    train = readKoreanData(f"{KoreanBase}/train.txt")
     return train, dev, test
 
 class TagIdConverter:
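The removed readEnglishData parses space-separated CoNLL-2003-style lines, skipping -DOCSTART- records and splitting sentences at blank lines. The snippet below only illustrates that input shape and the skip rule with made-up rows; it is not a replacement for the deleted function.

# Illustrative CoNLL-style input: word, POS, chunk, NER columns; -DOCSTART- rows are skipped.
sample = [
    ["-DOCSTART-", "-X-", "-X-", "O"],
    [],
    ["EU", "NNP", "B-NP", "B-ORG"],
    ["rejects", "VBZ", "B-VP", "O"],
    ["German", "JJ", "B-NP", "B-MISC"],
    [],
]

sentences, current = [], []
for row in sample:
    if row and row[0] == "-DOCSTART-":
        continue
    if not row:
        if current:
            sentences.append(current)
        current = []
    else:
        current.append(row)
print(sentences)   # one sentence with three (word, POS, chunk, NER) rows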
@@ -103,12 +86,6 @@ class TagIdConverter:
     @property
     def pad_id(self):
         return self.vocab["[PAD]"]
-    @property
-    def size(self):
-        return len(self.vocab)
-
-    def __len__(self):
-        return self.size
 
     def convert_ids_to_tokens(self,ids: List[int]):
         return [self.ids_to_token[id] for id in ids]
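This hunk shows TagIdConverter's pad_id property and convert_ids_to_tokens (the older side also had size/__len__). A hedged usage sketch follows: the id values are made up but match the deleted tags.json shown earlier (0 = [PAD], 1 = B-LOC, 5 = I-LOC, 9 = O), and TagIdConverter() is assumed to load that tag set.

# Hedged usage sketch: strip padding from a label row, then map ids back to tag names.
converter = TagIdConverter()
row = [9, 1, 5, 9, 0, 0]                                   # O, B-LOC, I-LOC, O, [PAD], [PAD]
labels = converter.convert_ids_to_tokens([i for i in row if i != converter.pad_id])
print(labels)                                              # ['O', 'B-LOC', 'I-LOC', 'O']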
@@ -174,37 +151,29 @@ def make_long_namedEntity(a,b,c):
             break
     return ret
 
 
 
 """
 extracts and stores tags set from the given data.
 """
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="create tags list")
-    parser.add_argument("--kind","-k",default='korean', help='kind of language: korean or english')
-    parser.add_argument("--stdout",action='store_true',help='print tags data to stdout')
-    parser.add_argument("--path",default="tags.json", help="path of tags data")
-
-    args = parser.parse_args()
-
     from tqdm import tqdm
-    if args.kind == "korean" or args.kind == "ko" or args.kind == "kor":
-        train, dev, test = readEnglishDataAll()
-    elif args.kind == "english" or args.kind == "en" or args.kind =="eng":
-        train, dev, test = readKoreanDataAll()
-    else:
-        print("unknown language",file=sys.stderr)
-        exit(1)
+    t = TagIdConverter()
+    train, dev, test = readKoreanDataAll()
     vocab = set()
     def getTags(lst: List[Sentence]):
         for s in tqdm(lst):
             for e in s.detail:
                 vocab.add(e)
-    print("get tags from train...",file=sys.stderr)
+    print("get tags from train...")
     getTags(train)
-    print("get tags from dev...",file=sys.stderr)
+    print("get tags from dev...")
     getTags(dev)
-    print("get tags from test...",file=sys.stderr)
+    print("get tags from test...")
     getTags(test)
-    print(vocab,file=sys.stderr)
+    print(vocab)
 
     for v in vocab:
         if v == "O":
             continue
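This __main__ block collects every tag seen in the data and, around the next hunk, checks that each B-/I- tag has its counterpart before the list is written out; only fragments of that check ("could not found pair", vocab.add) are visible in the diff. The sketch below illustrates the pair-completion idea under that assumption; the vocab contents are made up.

# Hedged sketch of B-/I- pair completion hinted at by "could not found pair" below.
vocab = {"B-LOC", "I-LOC", "B-PER", "O"}
for tag in list(vocab):
    if tag == "O":
        continue
    prefix, name = tag.split("-", 1)
    pair = ("I-" if prefix == "B" else "B-") + name
    if pair not in vocab:
        print("could not found pair", pair)
        vocab.add(pair)
print(sorted(vocab))   # B-PER gains its missing I-PER counterpart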
@@ -214,6 +183,7 @@ if __name__ == "__main__":
         if not v in vocab:
             print("could not found pair " ,v)
             vocab.add(v)
+
     tags = [{"name":"[PAD]","index":0}]
     i = 1
     vocab_list = [*vocab]
@@ -221,10 +191,6 @@
     for v in vocab_list:
         tags.append({"name":v,"index":i})
         i += 1
-    print(tags,file=sys.stderr)
-    if args.stdout:
-        json.dump(tags,sys.stdout,ensure_ascii=False, indent=2)
-    else:
-        p = args.path
-        with open(p,"w",encoding="utf-8") as fp:
-            json.dump(tags,fp,ensure_ascii=False, indent=2)
+    print(tags)
+    with open("tags.json","w",encoding="utf-8") as fp:
+        json.dump(tags,fp,ensure_ascii=False, indent=2)
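Both sides end by dumping the tag list with json.dump(..., ensure_ascii=False, indent=2); the newer side always writes tags.json, while the older side chooses between stdout and a --path file. A small sketch of that older output logic follows, with an illustrative function name; the argument names come from the argparse calls deleted above.

import json, sys

# Hedged sketch of the older side's two output modes (--stdout vs --path).
def write_tags_sketch(tags, to_stdout=False, out_path="tags.json"):
    if to_stdout:
        json.dump(tags, sys.stdout, ensure_ascii=False, indent=2)
    else:
        with open(out_path, "w", encoding="utf-8") as fp:
            json.dump(tags, fp, ensure_ascii=False, indent=2)

write_tags_sketch([{"name": "[PAD]", "index": 0}, {"name": "O", "index": 1}], to_stdout=True)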