feat: get args

This commit is contained in:
monoid 2022-02-22 17:20:16 +09:00
parent 609174b089
commit 142ad917bc

View File

@ -1,7 +1,9 @@
import enum import enum
from io import TextIOWrapper from io import TextIOWrapper
import sys
from typing import Iterable, NamedTuple, List, Sequence, TypeVar from typing import Iterable, NamedTuple, List, Sequence, TypeVar
import json import json
import argparse
KoreanBase="[Ko, En] NER, POStag data/국문 NER, POS" KoreanBase="[Ko, En] NER, POStag data/국문 NER, POS"
EnglishBase="[Ko, En] NER, POStag data/영문 NER, POS" EnglishBase="[Ko, En] NER, POStag data/영문 NER, POS"
@ -170,20 +172,33 @@ def make_long_namedEntity(a,b,c):
extracts and stores tags set from the given data. extracts and stores tags set from the given data.
""" """
if __name__ == "__main__": if __name__ == "__main__":
# Command-line interface: which corpus to scan and where the tag list goes.
parser = argparse.ArgumentParser(description="create tags list")
parser.add_argument("--kind", "-k", default='korean', help='kind of language: korean or english')
parser.add_argument("--stdout", action='store_true', help='print tags data to stdout')
parser.add_argument("--path", default="tags.json", help="path of tags data")
args = parser.parse_args()
# Imported inside the main guard, so tqdm is only needed when run as a script.
from tqdm import tqdm
# Each language name (and its short aliases) selects its own reader.
# Assumes readKoreanDataAll / readEnglishDataAll load the corpora their
# names suggest -- TODO confirm against their definitions.
if args.kind in ("korean", "ko", "kor"):
    train, dev, test = readKoreanDataAll()
elif args.kind in ("english", "en", "eng"):
    train, dev, test = readEnglishDataAll()
else:
    print("unknown language", file=sys.stderr)
    # sys.exit instead of the bare exit(): the latter is injected by the
    # site module and is not guaranteed to exist (e.g. under `python -S`).
    sys.exit(1)
vocab = set() vocab = set()
def getTags(lst: List[Sentence]):
    """Add every entry of each sentence's ``detail`` to the enclosing ``vocab`` set.

    The sentences are iterated through tqdm. Nothing is returned; ``vocab``
    is updated in place.
    """
    for sentence in tqdm(lst):
        vocab.update(sentence.detail)
# Scan the three splits in order, announcing each one on stderr.
for banner, split in (
    ("get tags from train...", train),
    ("get tags from dev...", dev),
    ("get tags from test...", test),
):
    print(banner, file=sys.stderr)
    getTags(split)
# Dump the raw tag set to stderr as well.
print(vocab, file=sys.stderr)
for v in vocab: for v in vocab:
if v == "O": if v == "O":
continue continue
@ -200,6 +215,10 @@ if __name__ == "__main__":
for v in vocab_list: for v in vocab_list:
tags.append({"name":v,"index":i}) tags.append({"name":v,"index":i})
i += 1 i += 1
# Echo the final tag list to stderr.
print(tags, file=sys.stderr)
# Emit the JSON either to the file named by --path or, with --stdout, to stdout.
if not args.stdout:
    with open(args.path, "w", encoding="utf-8") as out_file:
        json.dump(tags, out_file, ensure_ascii=False, indent=2)
else:
    json.dump(tags, sys.stdout, ensure_ascii=False, indent=2)