feat: count tag frequency
This commit is contained in:
parent
d2256b0ee9
commit
e080077a53
22
count_o.py
22
count_o.py
@ -1,22 +0,0 @@
|
|||||||
from dataset import readPreporcssedDataAll
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
Count the outside ("O"-tagged) tokens in the training set and print their share of all tokens.
|
|
||||||
"""
|
|
||||||
if __name__ == "__main__":
    # Tally "O" (outside) tags against all tags in the training split.
    train, _, _ = readPreporcssedDataAll()

    total_l = 0  # number of tags seen across all samples
    total_o = 0  # number of those tags equal to "O"

    for item in tqdm(train):
        entities = item["entity"]
        total_l += len(entities)
        total_o += sum(1 for tag in entities if tag == "O")

    # Guard the ratio: with no tags at all the division would raise
    # ZeroDivisionError instead of printing a result.
    ratio = total_o / total_l if total_l else 0.0
    print(f"{total_o}/{total_l} = {ratio}")
|
|
23
count_tag_freq.py
Normal file
23
count_tag_freq.py
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
from dataset import readPreporcssedDataAll
|
||||||
|
from read_data import TagIdConverter
|
||||||
|
from tqdm import tqdm
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
"""
|
||||||
|
Print how often each entity tag occurs in the training set, with its tag id, count, and percentage.
|
||||||
|
"""
|
||||||
|
if __name__ == "__main__":
    # Only the first element returned by the loader (bound to `train`) is used.
    train, _, _ = readPreporcssedDataAll()
    tagIdConverter = TagIdConverter()

    counter = Counter()
    total_l = 0  # total number of tags across every sample

    for sample in tqdm(train):
        tags = sample["entity"]
        # Counter.update adds one occurrence per element of the iterable.
        counter.update(tags)
        total_l += len(tags)

    # Header row, then one row per tag ordered from most to least frequent.
    print(f"{'token':<12}\t{'count':>12} {'frequency%':>12}")
    for token, count in counter.most_common():
        # The converter takes a list of tags; a single tag is wrapped and unwrapped.
        (tid,) = tagIdConverter.convert_tokens_to_ids([token])[:1]
        print(f"{f'{token}({tid})':<12}\t{count:>12}{count*100/total_l:>12.3f}%")
|
Loading…
Reference in New Issue
Block a user