feat: support English
This commit is contained in:
parent
54e757c247
commit
28ddd289b7
306118
engpre/dev.json
Normal file
306118
engpre/dev.json
Normal file
File diff suppressed because it is too large
Load Diff
287552
engpre/test.json
Normal file
287552
engpre/test.json
Normal file
File diff suppressed because it is too large
Load Diff
1231776
engpre/train.json
Normal file
1231776
engpre/train.json
Normal file
File diff suppressed because it is too large
Load Diff
@ -1,10 +1,11 @@
|
|||||||
|
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
import sys
|
import sys
|
||||||
from read_data import TagIdConverter, make_long_namedEntity, readKoreanDataAll, Sentence
|
from read_data import TagIdConverter, make_long_namedEntity, readEnglishDataAll, readKoreanDataAll, Sentence
|
||||||
from typing import Any, NamedTuple, List, Sequence, TypeVar
|
from typing import Any, List
|
||||||
import json
|
import json
|
||||||
import os.path as path
|
|
||||||
import tqdm
|
import tqdm
|
||||||
from transformers import PreTrainedTokenizer
|
from transformers import PreTrainedTokenizer
|
||||||
|
|
||||||
@ -35,27 +36,44 @@ def readPreprocessedData(path: str):
|
|||||||
with open(path,"r", encoding="utf-8") as fp:
|
with open(path,"r", encoding="utf-8") as fp:
|
||||||
return json.load(fp)
|
return json.load(fp)
|
||||||
|
|
||||||
def readPreporcssedDataAll():
|
def readPreporcssedDataAll(path = PRE_BASE_PATH):
|
||||||
train = readPreprocessedData(path.join(PRE_BASE_PATH,"train.json"))
|
train = readPreprocessedData(os.path.join(path,"train.json"))
|
||||||
dev = readPreprocessedData(path.join(PRE_BASE_PATH,"dev.json"))
|
dev = readPreprocessedData(os.path.join(path,"dev.json"))
|
||||||
test = readPreprocessedData(path.join(PRE_BASE_PATH,"test.json"))
|
test = readPreprocessedData(os.path.join(path,"test.json"))
|
||||||
return train, dev, test
|
return train, dev, test
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
parser.add_argument("--kind", default="korean")
|
||||||
|
parser.add_argument("path",default=PRE_BASE_PATH,help="directory path of processed data")
|
||||||
|
parser.add_argument("--tag", default="tags.json",help="path of tag description")
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
dirPath = args.path
|
||||||
|
|
||||||
|
if args.kind == "korean":
|
||||||
|
rawTrain, rawDev, rawTest = readKoreanDataAll()
|
||||||
|
elif args.kind == "english":
|
||||||
|
rawTrain, rawDev, rawTest = readEnglishDataAll()
|
||||||
|
else:
|
||||||
|
print("unknown language",file=sys.stderr)
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
converter = TagIdConverter(args.tag)
|
||||||
|
os.makedirs(dirPath)
|
||||||
|
|
||||||
from transformers import BertTokenizer
|
from transformers import BertTokenizer
|
||||||
PRETAINED_MODEL_NAME = 'bert-base-multilingual-cased'
|
PRETAINED_MODEL_NAME = 'bert-base-multilingual-cased'
|
||||||
|
|
||||||
rawTrain, rawDev, rawTest = readKoreanDataAll()
|
|
||||||
print("load tokenzier...",file=sys.stderr)
|
print("load tokenzier...",file=sys.stderr)
|
||||||
tokenizer = BertTokenizer.from_pretrained(PRETAINED_MODEL_NAME)
|
tokenizer = BertTokenizer.from_pretrained(PRETAINED_MODEL_NAME)
|
||||||
converter = TagIdConverter()
|
|
||||||
|
|
||||||
print("process train...",file=sys.stderr)
|
print("process train...",file=sys.stderr)
|
||||||
train = preprocessing(tokenizer,converter,rawTrain)
|
train = preprocessing(tokenizer,converter,rawTrain)
|
||||||
saveObject(path.join(PRE_BASE_PATH,"train.json"),train)
|
saveObject(path.join(dirPath,"train.json"),train)
|
||||||
print("process dev...",file=sys.stderr)
|
print("process dev...",file=sys.stderr)
|
||||||
dev = preprocessing(tokenizer,converter,rawDev)
|
dev = preprocessing(tokenizer,converter,rawDev)
|
||||||
saveObject(path.join(PRE_BASE_PATH,"dev.json"),dev)
|
saveObject(path.join(dirPath,"dev.json"),dev)
|
||||||
print("process test...",file=sys.stderr)
|
print("process test...",file=sys.stderr)
|
||||||
test = preprocessing(tokenizer,converter,rawTest)
|
test = preprocessing(tokenizer,converter,rawTest)
|
||||||
saveObject(path.join(PRE_BASE_PATH,"test.json"),test)
|
saveObject(path.join(dirPath,"test.json"),test)
|
||||||
|
Loading…
Reference in New Issue
Block a user