label_type_mapped = {
    # Valid (subject entity type, object entity type) pairs for each relation
    # label; an empty list means no type constraint is defined for the label.
    "no_relation": [],
    "org:dissolved": [('ORG', 'DAT')],  # date the organization was dissolved
    "org:founded": [('ORG', 'DAT')],  # date the organization was founded
    "org:place_of_headquarters": [('ORG', 'LOC'), ('ORG', 'ORG'), ('ORG', 'POH')],  # headquarters location
    "org:alternate_names": [('ORG', 'ORG'), ('ORG', 'POH')],  # alternative names used to refer to the organization
    "org:member_of": [('ORG', 'ORG'), ('ORG', 'POH'), ('ORG', 'LOC')],
    "org:members": [('ORG', 'ORG'), ('ORG', 'POH'), ('ORG', 'LOC')],
    "org:political/religious_affiliation": [('ORG', 'ORG'), ('ORG', 'POH')],
    "org:product": [('ORG', 'POH')],
    "org:founded_by": [('ORG', 'PER')],
    "org:top_members/employees": [('ORG', 'PER')],
    "org:number_of_employees/members": [('ORG', 'NOH')],
    "per:date_of_birth": [('PER', 'DAT')],
    "per:date_of_death": [('PER', 'DAT')],
    "per:place_of_birth": [('PER', 'LOC')],
    "per:place_of_death": [('PER', 'LOC')],
    "per:place_of_residence": [('PER', 'LOC')],
    "per:origin": [('PER', 'LOC'), ('PER', 'ORG')],
    "per:employee_of": [('PER', 'ORG')],
    "per:schools_attended": [('PER', 'ORG')],
    "per:alternate_names": [('PER', 'PER')],
    "per:parents": [('PER', 'PER'), ('PER', 'POH')],
    "per:children": [('PER', 'PER'), ('PER', 'POH')],
    "per:siblings": [('PER', 'PER'), ('PER', 'POH')],
    "per:spouse": [('PER', 'PER'), ('PER', 'POH')],
    "per:other_family": [('PER', 'PER'), ('PER', 'POH')],
    "per:colleagues": [('PER', 'PER')],
    "per:product": [('PER', 'POH')],
    "per:religion": [('PER', 'ORG'), ('PER', 'POH')],
    "per:title": [('PER', 'POH')],
}
import random
random.seed(42)  # fix the module-level RNG so the augmentation below is reproducible across runs
# Insert punctuation marks into a given sentence with the given ratio "punc_ratio"
# (AEDA-style augmentation) while keeping the entity character spans aligned.
def insert_punctuation_marks(original_data, punc_ratio=0.3):
    """Return a copy of ``original_data`` with random punctuation inserted.

    The sentence is cut into three segments around the two entity spans
    (before the first entity, between the two, after the second); punctuation
    marks are inserted in front of randomly chosen words, the entity words
    are re-appended unchanged, and ``subject_idx`` / ``object_idx`` on the
    returned copy are recomputed to point at the entities' new positions.

    Args:
        original_data: record exposing ``sentence`` (str), ``subject_word`` /
            ``object_word`` (str), ``subject_idx`` / ``object_idx``
            ((start, end) character spans into ``sentence``, treated as
            inclusive here — note the ``+1`` in the slicing) and a ``copy()``
            method. Presumably a dataclass/row object — TODO(review): confirm
            the exact type at the call site.
        punc_ratio: upper bound, as a fraction of the word count, on the
            number of punctuation marks inserted.

    Note: draws from the module-level ``random`` state, so output depends on
    the global seed set above.
    """
    PUNCTUATIONS = ['.', ',', '!', '?', ';', ':']
    new_data = original_data.copy()
    # Split the sentence around the entity spans, whichever entity comes first.
    # NOTE(review): the slices keep the spaces adjacent to the spans, so the
    # segments can contain empty-string "words" after split(' '), which join
    # back as double spaces — presumably acceptable for augmentation; confirm.
    if original_data.subject_idx < original_data.object_idx:
        sentence_s = original_data.sentence[:original_data.subject_idx[0]]
        sentence_m = original_data.sentence[original_data.subject_idx[1]+1:original_data.object_idx[0]]
        sentence_e = original_data.sentence[original_data.object_idx[1]+1:]
    else:
        sentence_s = original_data.sentence[:original_data.object_idx[0]]
        sentence_m = original_data.sentence[original_data.object_idx[1]+1:original_data.subject_idx[0]]
        sentence_e = original_data.sentence[original_data.subject_idx[1]+1:]
    words_s = sentence_s.split(' ')
    words_m = sentence_m.split(' ')
    words_e = sentence_e.split(' ')
    new_line = []
    words_length = len(words_s) + len(words_m) + len(words_e)
    # q: how many punctuation marks to insert; qs: which word positions
    # (counted across all three segments) receive one.
    q = random.randint(1, int(punc_ratio * words_length + 1))
    qs = random.sample(range(0, words_length), q)
    j = 0
    for idx, words in enumerate([words_s, words_m, words_e]):
        for word in words:
            if j in qs:
                # Place a random punctuation mark immediately before this word.
                new_line.append(PUNCTUATIONS[random.randint(0, len(PUNCTUATIONS)-1)])
                new_line.append(word)
            else:
                new_line.append(word)
            j += 1
        # Character offset where the next appended token will start after
        # ' '.join: total word lengths plus one separator per existing token.
        length = 0 if len(new_line) == 0 else sum([len(w) for w in new_line]) + len(new_line)
        if idx == 0:
            # Re-insert whichever entity comes first and record its new span.
            if original_data.subject_idx < original_data.object_idx:
                new_data.subject_idx = (length, length+len(original_data.subject_word)-1)
                new_line.append(original_data.subject_word)
            else:
                new_data.object_idx = (length, length+len(original_data.object_word)-1)
                new_line.append(original_data.object_word)
        elif idx == 1:
            # Re-insert the second entity; nothing follows the final segment.
            if original_data.subject_idx < original_data.object_idx:
                new_data.object_idx = (length, length+len(original_data.object_word)-1)
                new_line.append(original_data.object_word)
            else:
                new_data.subject_idx = (length, length+len(original_data.subject_word)-1)
                new_line.append(original_data.subject_word)
    new_data.sentence = ' '.join(new_line)
    return new_data
model_name = klue/bert-base
optimizer_name = AdamW
scheduler_name = CosineAnnealingLR
loss_name = CrossEntropy_weighted
num_train_epochs = 10
learning_rate = 5e-5
batch_size = 64
warmup_steps = 500
weight_decay = 0.01
early_stopping = 3 # patience: stop after 3 evaluations without improvement on eval/loss
random_state = 42
eval_steps = 500
seed = 42
오늘은 코드를 작성하는데 시간이 다소 많이 소요되어서, 시도해보고자 했던 것들을 하지 못하였다. 내일은 수업을 들으면서 증강, 정제 python 코드 정리 + 모델 테스트를 진행해보면 어떨까 싶다. (roberta-large가 좋다는 소문소문)