feat: get args
This commit is contained in:
parent
609174b089
commit
142ad917bc
35
read_data.py
35
read_data.py
@ -1,7 +1,9 @@
|
||||
import enum
|
||||
from io import TextIOWrapper
|
||||
import sys
|
||||
from typing import Iterable, NamedTuple, List, Sequence, TypeVar
|
||||
import json
|
||||
import argparse
|
||||
|
||||
KoreanBase="[Ko, En] NER, POStag data/국문 NER, POS"
|
||||
EnglishBase="[Ko, En] NER, POStag data/영문 NER, POS"
|
||||
@ -170,20 +172,33 @@ def make_long_namedEntity(a,b,c):
|
||||
Extracts and stores the set of tags from the given data.
|
||||
"""
|
||||
if __name__ == "__main__":
|
||||
# Command-line interface for the tag-extraction script. Options are declared
# as (flags, keyword-arguments) pairs and registered in order, so the
# generated --help output lists them in the same sequence as before.
parser = argparse.ArgumentParser(description="create tags list")
for flags, options in (
    (("--kind", "-k"), {"default": 'korean', "help": 'kind of language: korean or english'}),
    (("--stdout",), {"action": 'store_true', "help": 'print tags data to stdout'}),
    (("--path",), {"default": "tags.json", "help": "path of tags data"}),
):
    parser.add_argument(*flags, **options)

args = parser.parse_args()
|
||||
|
||||
from tqdm import tqdm
|
||||
train, dev, test = readEnglishDataAll()
|
||||
# Load the corpus matching the requested language. The accepted spellings are
# compared exactly (case-sensitive). Previously the two branches were swapped,
# so `--kind korean` loaded the English data and vice versa.
if args.kind in ("korean", "ko", "kor"):
    train, dev, test = readKoreanDataAll()
elif args.kind in ("english", "en", "eng"):
    train, dev, test = readEnglishDataAll()
else:
    print("unknown language", file=sys.stderr)
    # sys.exit is always available; the bare `exit` helper is injected by the
    # `site` module and is missing under `python -S` or in frozen builds.
    sys.exit(1)
|
||||
vocab = set()
|
||||
def getTags(lst: List[Sentence]):
    """Collect the entries of every sentence's ``detail`` into ``vocab``.

    ``vocab`` is the set defined in the enclosing scope; it is updated in
    place and nothing is returned. The iteration over ``lst`` is wrapped in
    ``tqdm`` so a progress bar is shown while the sentences are scanned.
    """
    for sentence in tqdm(lst):
        # set.update adds each element of the iterable, exactly like calling
        # vocab.add() once per entry.
        vocab.update(sentence.detail)
|
||||
print("get tags from train...")
|
||||
print("get tags from train...",file=sys.stderr)
|
||||
getTags(train)
|
||||
print("get tags from dev...")
|
||||
print("get tags from dev...",file=sys.stderr)
|
||||
getTags(dev)
|
||||
print("get tags from test...")
|
||||
print("get tags from test...",file=sys.stderr)
|
||||
getTags(test)
|
||||
print(vocab)
|
||||
print(vocab,file=sys.stderr)
|
||||
for v in vocab:
|
||||
if v == "O":
|
||||
continue
|
||||
@ -200,6 +215,10 @@ if __name__ == "__main__":
|
||||
for v in vocab_list:
|
||||
tags.append({"name":v,"index":i})
|
||||
i += 1
|
||||
print(tags)
|
||||
with open("eng_tags.json","w",encoding="utf-8") as fp:
|
||||
json.dump(tags,fp,ensure_ascii=False, indent=2)
|
||||
print(tags,file=sys.stderr)
|
||||
# Emit the tag table as pretty-printed JSON (non-ASCII characters are kept
# as-is): to the file named by --path unless --stdout was requested.
if not args.stdout:
    with open(args.path, "w", encoding="utf-8") as out_file:
        json.dump(tags, out_file, ensure_ascii=False, indent=2)
else:
    json.dump(tags, sys.stdout, ensure_ascii=False, indent=2)
|
Loading…
Reference in New Issue
Block a user