feat: add command-line arguments (--kind, --stdout, --path) to the tag extraction script
This commit is contained in:
parent
609174b089
commit
142ad917bc
31
read_data.py
31
read_data.py
@ -1,7 +1,9 @@
|
|||||||
import enum
|
import enum
|
||||||
from io import TextIOWrapper
|
from io import TextIOWrapper
|
||||||
|
import sys
|
||||||
from typing import Iterable, NamedTuple, List, Sequence, TypeVar
|
from typing import Iterable, NamedTuple, List, Sequence, TypeVar
|
||||||
import json
|
import json
|
||||||
|
import argparse
|
||||||
|
|
||||||
KoreanBase="[Ko, En] NER, POStag data/국문 NER, POS"
|
KoreanBase="[Ko, En] NER, POStag data/국문 NER, POS"
|
||||||
EnglishBase="[Ko, En] NER, POStag data/영문 NER, POS"
|
EnglishBase="[Ko, En] NER, POStag data/영문 NER, POS"
|
||||||
@ -170,20 +172,33 @@ def make_long_namedEntity(a,b,c):
|
|||||||
extracts and stores tags set from the given data.
|
extracts and stores tags set from the given data.
|
||||||
"""
|
"""
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
parser = argparse.ArgumentParser(description="create tags list")
|
||||||
|
parser.add_argument("--kind","-k",default='korean', help='kind of language: korean or english')
|
||||||
|
parser.add_argument("--stdout",action='store_true',help='print tags data to stdout')
|
||||||
|
parser.add_argument("--path",default="tags.json", help="path of tags data")
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
if args.kind == "korean" or args.kind == "ko" or args.kind == "kor":
|
||||||
train, dev, test = readEnglishDataAll()
|
train, dev, test = readEnglishDataAll()
|
||||||
|
elif args.kind == "english" or args.kind == "en" or args.kind =="eng":
|
||||||
|
train, dev, test = readKoreanDataAll()
|
||||||
|
else:
|
||||||
|
print("unknown language",file=sys.stderr)
|
||||||
|
exit(1)
|
||||||
vocab = set()
|
vocab = set()
|
||||||
def getTags(lst: List[Sentence]):
|
def getTags(lst: List[Sentence]):
|
||||||
for s in tqdm(lst):
|
for s in tqdm(lst):
|
||||||
for e in s.detail:
|
for e in s.detail:
|
||||||
vocab.add(e)
|
vocab.add(e)
|
||||||
print("get tags from train...")
|
print("get tags from train...",file=sys.stderr)
|
||||||
getTags(train)
|
getTags(train)
|
||||||
print("get tags from dev...")
|
print("get tags from dev...",file=sys.stderr)
|
||||||
getTags(dev)
|
getTags(dev)
|
||||||
print("get tags from test...")
|
print("get tags from test...",file=sys.stderr)
|
||||||
getTags(test)
|
getTags(test)
|
||||||
print(vocab)
|
print(vocab,file=sys.stderr)
|
||||||
for v in vocab:
|
for v in vocab:
|
||||||
if v == "O":
|
if v == "O":
|
||||||
continue
|
continue
|
||||||
@ -200,6 +215,10 @@ if __name__ == "__main__":
|
|||||||
for v in vocab_list:
|
for v in vocab_list:
|
||||||
tags.append({"name":v,"index":i})
|
tags.append({"name":v,"index":i})
|
||||||
i += 1
|
i += 1
|
||||||
print(tags)
|
print(tags,file=sys.stderr)
|
||||||
with open("eng_tags.json","w",encoding="utf-8") as fp:
|
if args.stdout:
|
||||||
|
json.dump(tags,sys.stdout,ensure_ascii=False, indent=2)
|
||||||
|
else:
|
||||||
|
p = args.path
|
||||||
|
with open(p,"w",encoding="utf-8") as fp:
|
||||||
json.dump(tags,fp,ensure_ascii=False, indent=2)
|
json.dump(tags,fp,ensure_ascii=False, indent=2)
|
Loading…
Reference in New Issue
Block a user