feat: support English

This commit is contained in:
monoid 2022-02-22 18:33:29 +09:00
parent 54e757c247
commit 28ddd289b7
4 changed files with 1825476 additions and 12 deletions

306118
engpre/dev.json Normal file

File diff suppressed because it is too large. (Load Diff)

287552
engpre/test.json Normal file

File diff suppressed because it is too large. (Load Diff)

1231776
engpre/train.json Normal file

File diff suppressed because it is too large. (Load Diff)

View File

@ -1,10 +1,11 @@
import argparse
import os
import sys
from read_data import TagIdConverter, make_long_namedEntity, readKoreanDataAll, Sentence
from typing import Any, NamedTuple, List, Sequence, TypeVar
from read_data import TagIdConverter, make_long_namedEntity, readEnglishDataAll, readKoreanDataAll, Sentence
from typing import Any, List
import json
import os.path as path
import tqdm
from transformers import PreTrainedTokenizer
@ -35,27 +36,44 @@ def readPreprocessedData(path: str):
with open(path,"r", encoding="utf-8") as fp:
return json.load(fp)
def readPreporcssedDataAll():
train = readPreprocessedData(path.join(PRE_BASE_PATH,"train.json"))
dev = readPreprocessedData(path.join(PRE_BASE_PATH,"dev.json"))
test = readPreprocessedData(path.join(PRE_BASE_PATH,"test.json"))
def readPreporcssedDataAll(path = PRE_BASE_PATH):
train = readPreprocessedData(os.path.join(path,"train.json"))
dev = readPreprocessedData(os.path.join(path,"dev.json"))
test = readPreprocessedData(os.path.join(path,"test.json"))
return train, dev, test
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--kind", default="korean")
parser.add_argument("path",default=PRE_BASE_PATH,help="directory path of processed data")
parser.add_argument("--tag", default="tags.json",help="path of tag description")
args = parser.parse_args()
dirPath = args.path
if args.kind == "korean":
rawTrain, rawDev, rawTest = readKoreanDataAll()
elif args.kind == "english":
rawTrain, rawDev, rawTest = readEnglishDataAll()
else:
print("unknown language",file=sys.stderr)
exit(1)
converter = TagIdConverter(args.tag)
os.makedirs(dirPath)
from transformers import BertTokenizer
PRETAINED_MODEL_NAME = 'bert-base-multilingual-cased'
rawTrain, rawDev, rawTest = readKoreanDataAll()
print("load tokenzier...",file=sys.stderr)
tokenizer = BertTokenizer.from_pretrained(PRETAINED_MODEL_NAME)
converter = TagIdConverter()
print("process train...",file=sys.stderr)
train = preprocessing(tokenizer,converter,rawTrain)
saveObject(path.join(PRE_BASE_PATH,"train.json"),train)
saveObject(path.join(dirPath,"train.json"),train)
print("process dev...",file=sys.stderr)
dev = preprocessing(tokenizer,converter,rawDev)
saveObject(path.join(PRE_BASE_PATH,"dev.json"),dev)
saveObject(path.join(dirPath,"dev.json"),dev)
print("process test...",file=sys.stderr)
test = preprocessing(tokenizer,converter,rawTest)
saveObject(path.join(PRE_BASE_PATH,"test.json"),test)
saveObject(path.join(dirPath,"test.json"),test)