feat: count tag frequency
This commit is contained in:
		
							parent
							
								
									d2256b0ee9
								
							
						
					
					
						commit
						e080077a53
					
				
					 2 changed files with 23 additions and 22 deletions
				
			
		
							
								
								
									
										22
									
								
								count_o.py
									
										
									
									
									
								
							
							
						
						
									
										22
									
								
								count_o.py
									
										
									
									
									
								
							|  | @ -1,22 +0,0 @@ | |||
| from dataset import readPreporcssedDataAll | ||||
| from tqdm import tqdm | ||||
| 
 | ||||
| 
 | ||||
| """ | ||||
| count outside tokens (O tokens) and print their share of all tokens | ||||
| """ | ||||
def count_outside_tags(samples):
    """Count how many tags equal "O" across all samples.

    Args:
        samples: Iterable of mappings; each must provide an "entity" entry
            that supports ``len()`` and iteration (presumably a list of tag
            strings -- confirm against the dataset loader).

    Returns:
        A ``(outside, total)`` tuple: the number of entries equal to "O"
        and the number of entries overall.
    """
    outside = 0
    total = 0
    for sample in samples:
        tags = sample["entity"]
        total += len(tags)
        outside += sum(1 for tag in tags if tag == "O")
    return outside, total


if __name__ == "__main__":
    # Only the first value returned by the loader is used here.
    train, _, _ = readPreporcssedDataAll()

    total_o, total_l = count_outside_tags(tqdm(train))

    if total_l:
        print(f"{total_o}/{total_l} = {total_o/total_l}")
    else:
        # No tags at all: the ratio is undefined, so report that instead of
        # crashing with ZeroDivisionError.
        print(f"{total_o}/{total_l} = n/a (no tags found)")
							
								
								
									
										23
									
								
								count_tag_freq.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								count_tag_freq.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,23 @@ | |||
| from dataset import readPreporcssedDataAll | ||||
| from read_data import TagIdConverter | ||||
| from tqdm import tqdm | ||||
| from collections import Counter | ||||
| 
 | ||||
| """ | ||||
| get the frequency of each tag | ||||
| """ | ||||
if __name__ == "__main__":
    # Only the first value returned by the loader is used; the rest are dropped.
    train, _, _ = readPreporcssedDataAll()
    converter = TagIdConverter()
    tag_counts = Counter()
    tag_total = 0

    # Tally every entry of each sample's "entity" sequence.
    for sample in tqdm(train):
        tags = sample["entity"]
        tag_counts.update(tag for tag in tags)
        tag_total += len(tags)

    # Header row, then one row per distinct tag, most frequent first.
    print(f"{'token':<12}\t{'count':>12} {'frequency%':>12}")
    for tag, occurrences in tag_counts.most_common():
        tag_id = converter.convert_tokens_to_ids([tag])[0]
        label = f"{tag}({tag_id})"
        percent = occurrences * 100 / tag_total
        print(f"{label:<12}\t{occurrences:>12}{percent:>12.3f}%")
		Loading…
	
	Add table
		
		Reference in a new issue