Upload 4 files
Browse files- bespoke/format.py +105 -0
- bespoke/format_annotated_jsonl.py +234 -0
- bespoke/format_unannotated_jsonl.py +160 -0
- bespoke/md_draft.txt +74 -0
bespoke/format.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
for LLAMA parsing:
|
| 3 |
+
takes a json of annotated minecraft games and converts to
|
| 4 |
+
a turn format to be used by format_(un)annotated_jsonl.py.
|
| 5 |
+
"""
|
| 6 |
+
import os
|
| 7 |
+
import json
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def is_nl(textstr):
    """
    Determine if an element is a non-linguistic action sequence (EEU)
    rather than a linguistic unit (EDU).

    An element is treated as an action sequence when it contains the word
    'place' or 'pick', at least three numeric coordinates and at least one
    block color, e.g. "place purple 5 1 -5".

    Coordinates may be negative ("-5") and, when several actions are listed
    in one string, may be followed by a comma ("-5,"); both forms are
    counted as numerals.

    Returns 1 for an action sequence, 0 otherwise.
    """
    block_colors = ('orange', 'blue', 'green', 'yellow', 'red', 'purple')
    words = textstr.split(' ')

    def _is_coordinate(token):
        # drop the separator between listed actions, then an optional sign,
        # because str.isnumeric() rejects both "-5" and "5,"
        token = token.rstrip(',')
        if token[:1] in ('-', '+'):
            token = token[1:]
        return token.isnumeric()

    numerals = [w for w in words if _is_coordinate(w)]
    colors = [w for w in words if w in block_colors]
    has_action_verb = 'place' in words or 'pick' in words

    if has_action_verb and len(numerals) >= 3 and len(colors) > 0:
        return 1
    return 0
|
| 22 |
+
|
| 23 |
+
current_folder = os.getcwd()

# Replace the <...> placeholders with the real file names before running.
data_path = os.path.join(current_folder, '<orig_data>.json')
# The result is written as a single JSON array with json.dump, and the
# format_(un)annotated_jsonl.py scripts read '<turns>.json', so the
# extension is '.json' (not '.jsonl').
save_path = os.path.join(current_folder, '<turns>.json')

with open(data_path, 'r', encoding='utf-8') as j:
    games = json.load(j)

##for each game, find turns, edus.
##feed one turn at a time, with each edu numbered, plus structure for that turn
##TEXT: ##STRUCTURE: ##NEXT TURN => #output structure
turn_version = []
for game in games:
    edus = game['edus']
    if not edus:
        # nothing to group into turns; skip instead of failing on edus[0]
        print('skipping game {}: no edus'.format(game['id']))
        continue

    game_turns = []
    turn_no = 0
    # last_speaker starts as None so that the first edu is always its own
    # turn, whoever speaks next.
    last_speaker = None
    new_turn = {'turn': turn_no,
                'speaker': edus[0]['speaker'],
                'edus': [edus[0]['text']]}

    for edu in edus[1:]:
        speaker = edu['speaker']
        if speaker != 'Architect' and is_nl(edu['text']):
            # an action sequence is always its own Builder turn; resetting
            # last_speaker forces whatever follows to open a new turn too
            last_speaker = None
            turn_speaker = 'Builder'
        elif speaker != last_speaker:
            # speaker change: open a new turn for this speaker
            last_speaker = speaker
            turn_speaker = speaker
        else:
            # same speaker keeps talking: extend the current turn
            new_turn['edus'].append(edu['text'])
            continue

        #finish and append last turn, then start a new one
        game_turns.append(new_turn)
        turn_no += 1
        new_turn = {'turn': turn_no,
                    'speaker': turn_speaker,
                    'edus': [edu['text']]}

    #take care of last speaker turn in the game
    game_turns.append(new_turn)

    #add game dict (id + turns) to list of games
    turn_version.append({'id': game['id'], 'turns': game_turns})

with open(save_path, 'w') as outfile:
    json.dump(turn_version, outfile)

print('json saved for {} games'.format(len(turn_version)))
|
| 105 |
+
|
bespoke/format_annotated_jsonl.py
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
A dialogue is a list of samples, where each sample contains one new speaker turn.
|
| 3 |
+
|
| 4 |
+
takes a json of annotated minecraft games and converts to
|
| 5 |
+
a turn format to be used in LLAMA parsing.
|
| 6 |
+
|
| 7 |
+
NB: when creating jsonl, use '###PS' for 'predict structure'
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
import json
|
| 12 |
+
import jsonlines
|
| 13 |
+
from collections import defaultdict
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def preprocess_edus(tlist):
    """
    Number the edus of a dialogue and group them by turn.

    Each edu becomes the string '<index> <Spkr> <text>', where the index
    runs over the whole dialogue and the speaker tag is the first four
    characters of the turn's speaker name. Ex:

    [...['6 <Buil> What is D2'],
    ['7 <Arch> Ah there is no stack,', '8 <Arch> pick up the washer'],...]

    Here one turn holds the edu with index 6 and the next turn holds the
    edus with indexes 7 and 8.

    NB: in a dialogue, might be best to change speakers to "Arch" and "Buil" to
    reflect MSDC training data
    """
    numbered_turns = []
    edu_index = 0
    for turn in tlist:
        #if needed, write code to change speaker names here
        tag = '<' + turn['speaker'][:4] + '>'
        numbered = []
        for text in turn['edus']:
            numbered.append(' '.join((str(edu_index), tag, text)))
            edu_index += 1
        numbered_turns.append(numbered)

    return numbered_turns
|
| 43 |
+
|
| 44 |
+
def get_windows(dial_turns, distance = 15):
    """
    Takes the output from preprocess_edus() and
    returns a list of index pairs. Each pair gives the delimiting turn indexes
    (both inclusive) for a window of turns whose total edus <= distance.

    Ex.
    [(0, 11), (1, 12), (4, 13), (5, 14), ...]

    Here, turns 0 through 11 contain edus <= distance, but once the edus from turn
    12 are added, the window has to be adjusted in order for edus to remain <= distance.
    The window must be shifted to 1-12, then to 4-13, etc.

    The first pair covers the longest prefix of turns that fits; each later
    pair ends on the next turn and starts as far back as the limit allows.
    If the whole dialogue fits within `distance`, the single pair
    (0, last turn index) is returned. An empty input yields [(0, 0)].
    """
    edu_lens = [len(d) for d in dial_turns]
    if not edu_lens:
        return [(0, 0)]

    windows = []
    esum = 0
    for i, w in enumerate(edu_lens):
        esum += w
        if esum > distance:
            first_cutoff = i - 1
            break
    else:
        # the limit was never exceeded: the first window is the whole
        # dialogue (leaving the cutoff at 0 would drop every later turn)
        first_cutoff = len(edu_lens) - 1
    windows.append((0, first_cutoff))

    for i in range(first_cutoff + 1, len(edu_lens)):
        # walk backwards from turn i until the edu budget is exceeded;
        # the window then starts one turn after that point
        esum = 0
        for r in range(i, -1, -1):
            esum += edu_lens[r]
            if esum > distance:
                windows.append((r + 1, i))
                break

    return windows
|
| 86 |
+
|
| 87 |
+
def format_rels(index_list, rel_dict):
    """
    Group the relations of a dialogue by the turn their target belongs to.

    Inputs:
    1. index_list: a list of lists, one per dialogue turn, holding the edu
    indexes of the edus in that turn.
    2. rel_dict: the relations of the dialogue; each relation is read
    through its 'x', 'y' and 'type' keys.

    Returns a list of lists, one per turn, containing the relations whose
    targets (y indexes) are edus of that turn, ordered by y index.

    Each relation is in the format [x index, y index, 'REL(x,y)'].

    *NB we ignore backwards relations in the data
    """
    map_rels_str = {'Comment':'COM', 'Contrast':'CONTR', 'Correction':'CORR', 'Question-answer_pair':'QAP',
                    'Acknowledgement':'ACK','Elaboration':'ELAB','Clarification_question':'CLARIFQ',
                    'Conditional':'COND', 'Continuation':'CONTIN', 'Result':'RES', 'Explanation':'EXPL',
                    'Q-Elab':'QELAB', 'Alternation':'ALT', 'Narration':'NARR',
                    'Confirmation_question':'CONFQ', 'Sequence':'SEQ'}

    rel_list = []
    for turn_indexes in index_list:
        turn_rels = []
        for rel in rel_dict:
            source, target = rel['x'], rel['y']
            if target not in turn_indexes:
                continue
            if not source < target:
                #only take forward relations
                continue
            label = '{}({},{})'.format(map_rels_str[rel['type']], source, target)
            turn_rels.append([source, target, label])
        # stable sort: relations with the same target keep their input order
        turn_rels.sort(key=lambda item: item[1])
        rel_list.append(turn_rels)

    return rel_list
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
current_folder = os.getcwd()

# Replace the <...> placeholders with the real file names before running.
# os.path.join supplies the path separator that plain string concatenation
# left out for the two input files.
data_turns_path = os.path.join(current_folder, '<turns>.json')
annotation_path = os.path.join(current_folder, '<orig_data>.json')
save_path = os.path.join(current_folder, '/<parser>.jsonl'.lstrip('/'))

with open(data_turns_path, 'r') as j:
    dialogues = json.load(j)

with open(annotation_path, 'r') as j:
    annotations = json.load(j)

# index the annotated games by id once instead of rescanning the list for
# every dialogue; setdefault keeps the first entry if an id is repeated
annotations_by_id = {}
for annotation in annotations:
    annotations_by_id.setdefault(annotation['id'], annotation)

json_l = []

dialogue_count = 0

DISTANCE = 15


def _format_sample(context, structure, new_turn):
    """Build the prompt text from context edus, context relations and the new turn's edus."""
    return ('Context: ' + "\n".join(context)
            + '\nStructure: ' + ' '.join(structure)
            + '\nNew Turn: ' + "\n".join(new_turn))


for dial in dialogues:
    dialogue_count += 1
    dial_id = dial['id']
    print(dial_id)

    #if generating a test file for incremental parsing, add space marker between dialogues
    #for any other files (test for gold parsing or train), remove this ---->
    sample = {}
    sample['PS'] = ""
    sample['sample'] = "NEW DIALOGUE " + dial_id
    json_l.append(sample)
    #<-------------------------------

    turns = preprocess_edus(dial['turns']) #preprocess game edus

    windows = get_windows(turns, DISTANCE)

    dial_rels = annotations_by_id[dial_id]['relations']

    # edu indexes per turn, parsed back from the '<index> <Spkr> text' strings
    turn_indexes = [[int(e.split('<')[0].strip()) for e in turn] for turn in turns]

    # relations[i] holds the formatted relations whose target is in turn i
    relations = format_rels(turn_indexes, dial_rels)

    #start with first window
    global_context = list(turns[0]) #add 0 turn "mission has started"
    structure = []
    first_end = windows[0][1] + 1
    #go through each subsequent turn in first window and create a new sample
    for turn_edus, turn_rels in zip(turns[1:first_end], relations[1:first_end]):
        rels_list = [rel[2] for rel in turn_rels]

        sample = {}
        sample['PS'] = ' '.join(rels_list)
        sample['sample'] = _format_sample(global_context, structure, turn_edus)
        json_l.append(sample)

        # the turn just predicted becomes context for the next sample
        global_context.extend(turn_edus)
        structure.extend(rels_list)

    #now for each new turn added beyond the first window, we need to adjust the context window
    for start, end in windows[1:]:

        #find min edu index for this window
        # NOTE(review): if a single turn holds more edus than DISTANCE,
        # start can be end + 1; confirm how such turns should be handled.
        min_x = min(turn_indexes[start])

        global_context = []
        structure = []
        for turn_edus, turn_rels in zip(turns[start:end], relations[start:end]):
            global_context.extend(turn_edus)
            #keep only the structure whose source (x index) is still inside
            #the window, i.e. greater than or equal to the new cutoff
            structure.extend(rel[2] for rel in turn_rels if rel[0] >= min_x)

        # relations to predict for the new turn, with the same cutoff applied
        rels_list = [rel[2] for rel in relations[end] if rel[0] >= min_x]

        sample = {}
        sample['PS'] = ' '.join(rels_list)
        sample['sample'] = _format_sample(global_context, structure, turns[end])
        json_l.append(sample)


#convert the dicts into json dicts for json_l
with jsonlines.open(save_path, mode='w') as writer:
    for x in json_l:
        writer.write(x)

print('jsonl saved for {} games'.format(dialogue_count))
|
| 233 |
+
|
| 234 |
+
|
bespoke/format_unannotated_jsonl.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
A dialogue is a list of samples, where each sample contains one new speaker turn.
|
| 3 |
+
|
| 4 |
+
takes a json of annotated minecraft games and converts to
|
| 5 |
+
a turn format to be used in LLAMA parsing.
|
| 6 |
+
|
| 7 |
+
NB: when creating json-l, use '###PS' for 'predict structure'
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
import json
|
| 12 |
+
import jsonlines
|
| 13 |
+
from collections import defaultdict
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def preprocess_edus(tlist):
    """
    Return one list per turn, holding that turn's edus as numbered strings.

    Every edu is rendered as '<index> <Spkr> <text>': the index counts edus
    across the whole dialogue and the speaker tag is the first four
    characters of the turn's speaker. Ex:

    [...['6 <Buil> What is D2'],
    ['7 <Arch> Ah there is no stack,', '8 <Arch> pick up the washer'],...]

    One turn contains the edu with index 6, and the next turn contains the
    edus with indexes 7 and 8.

    NB: in a dialogue, might be best to change speakers to "Arch" and "Buil" to
    reflect MSDC training data
    """
    result = []
    next_index = 0

    for turn in tlist:
        short_name = turn['speaker'][:4]
        #write code to change speaker names here

        turn_strings = []
        for edu_text in turn['edus']:
            prefix = '{} <{}> '.format(next_index, short_name)
            turn_strings.append(prefix + edu_text)
            next_index += 1

        result.append(turn_strings)

    return result
|
| 47 |
+
|
| 48 |
+
def get_windows(dial_turns, distance = 15):
    """
    Takes the output from preprocess_edus() and
    returns a list of index pairs. Each pair gives the delimiting turn indexes
    (both inclusive) for a window of turns whose total edus <= distance.

    Ex.
    [(0, 11), (1, 12), (4, 13), (5, 14), ...]

    Here, turns 0 through 11 contain edus <= distance, but once the edus from turn
    12 are added, the window has to be adjusted in order for edus to remain <= distance.
    The window must be shifted to 1-12, then to 4-13, etc.

    When the whole dialogue stays within `distance`, the only pair returned
    is (0, last turn index). An empty input yields [(0, 0)].
    """
    edu_lens = [len(d) for d in dial_turns]
    if not edu_lens:
        return [(0, 0)]

    # first window: the longest prefix of turns that fits in the budget
    esum = 0
    for i, w in enumerate(edu_lens):
        esum += w
        if esum > distance:
            first_cutoff = i - 1
            break
    else:
        # budget never exceeded, so every turn belongs to the first window
        # (a cutoff of 0 here would leave all later turns without a sample)
        first_cutoff = len(edu_lens) - 1
    windows = [(0, first_cutoff)]

    # later windows: end on turn i, start as far back as the budget allows
    for i in range(first_cutoff + 1, len(edu_lens)):
        esum = 0
        for r in range(i, -1, -1):
            esum += edu_lens[r]
            if esum > distance:
                windows.append((r + 1, i))
                break

    return windows
|
| 86 |
+
|
| 87 |
+
current_folder = os.getcwd()

# input: the turns file; output: the jsonl file for the parser
# (replace the <...> placeholders with the real file names)
data_path = current_folder + '/<turns>.json'
save_path = current_folder + '/<parser>.jsonl'

with open(data_path, 'r') as j:
    jfile = json.load(j)
    dialogues = jfile

json_l = []

dialogue_count = 0

DISTANCE = 15
start_index = 0

for dial in dialogues:
    dialogue_count += 1
    dial_id = dial['id']
    print(dial_id)

    #if generating a test file for incremental parsing, add space marker between dialogues
    #for any other files (test for gold parsing or train), remove this ---->
    json_l.append({'PS': "", 'sample': "NEW DIALOGUE " + dial_id})
    #<-------------------------------

    # numbered edu strings, grouped by turn
    turns = preprocess_edus(dial['turns'])
    print(turns)

    windows = get_windows(turns, DISTANCE)
    print('------------------')
    print(windows)

    # first window: the context starts with turn 0 ("mission has started")
    # and grows by one turn after each sample
    global_context = list(turns[0])
    last_turn_of_first_window = windows[0][1]
    for new_turn in turns[1:last_turn_of_first_window + 1]:
        context_text = "\n".join(global_context)
        new_turn_text = "\n".join(new_turn)

        # no gold structure is available, so 'PS' and 'Structure:' stay empty
        json_l.append({
            'PS': "",
            'sample': 'Context: ' + context_text + "\nStructure: \nNew Turn: " + new_turn_text,
        })

        global_context.extend(new_turn)

    # beyond the first window the context is rebuilt for every new turn:
    # turns window[0] .. window[1]-1 are context, turn window[1] is new
    for window in windows[1:]:
        first_turn, new_turn_index = window[0], window[1]

        global_context = []
        for context_turn in turns[first_turn:new_turn_index]:
            global_context.extend(context_turn)

        context_text = "\n".join(global_context)
        new_turn_text = "\n".join(turns[new_turn_index])

        json_l.append({
            'PS': "",
            'sample': 'Context: ' + context_text + "\nStructure: \nNew Turn: " + new_turn_text,
        })


# write one json object per line
with jsonlines.open(save_path, mode='w') as writer:
    for x in json_l:
        writer.write(x)

print('jsonl saved for {} games'.format(dialogue_count))
|
| 159 |
+
|
| 160 |
+
|
bespoke/md_draft.txt
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Formatting your dialogue data for the Llamipa parser.
|
| 3 |
+
|
| 4 |
+
This is a collection of scripts which can regenerate the Llamipa data from the MSDC, or can help you to format your
|
| 5 |
+
own dialogue data for use with the Llamipa parser.
|
| 6 |
+
|
| 7 |
+
To start, the dialogue data must follow the MSDC format, where each dialogue is a json object, with "id"
|
| 8 |
+
and "edus" fields. If the dialogue is already annotated for discourse structure, it also needs a "relations" field
|
| 9 |
+
(see the corpus: https://huggingface.co/datasets/linagora/MinecraftStructuredDialogueCorpus).
|
| 10 |
+
|
| 11 |
+
**Make sure to include a dummy 0 move, "Mission has Started",
|
| 12 |
+
at the beginning of each dialogue.
|
| 13 |
+
|
| 14 |
+
## STEP 1: Use the dialogue json data to create an intermediate format, where each
|
| 15 |
+
speaker turn is a single object containing all discourse units.
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
[
|
| 19 |
+
"id": "log3566",
|
| 20 |
+
"turns": [
|
| 21 |
+
{
|
| 22 |
+
"turn": 0,
|
| 23 |
+
"speaker": "Builder",
|
| 24 |
+
"edus": [
|
| 25 |
+
"Mission has started."
|
| 26 |
+
]
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"turn": 1,
|
| 30 |
+
"speaker": "Architect",
|
| 31 |
+
"edus": [
|
| 32 |
+
"Hi!"
|
| 33 |
+
]
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"turn": 2,
|
| 37 |
+
"speaker": "Builder",
|
| 38 |
+
"edus": [
|
| 39 |
+
"Hi", "What are we building today?"
|
| 40 |
+
]
|
| 41 |
+
},...
|
| 42 |
+
]
|
| 43 |
+
]
|
| 44 |
+
|
| 45 |
+
The format.py script takes the dialogue json as input, and outputs a <turns>.json file.
|
| 46 |
+
|
| 47 |
+
Note: The script assumes the non-linguistic actions are of the same format as in the MSDC, e.g.:
|
| 48 |
+
|
| 49 |
+
{
|
| 50 |
+
"turn": 22,
|
| 51 |
+
"speaker": "Builder",
|
| 52 |
+
"edus": ["place purple 5 1 -5, place purple 5 1 -4, place purple 5 2 -5, place purple 4 1 -5"]
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
## STEP 2:
|
| 57 |
+
|
| 58 |
+
If using *UNANNOTATED* data, use the <turns>.json to create a <parser>.jsonl
|
| 59 |
+
file formatted for Llamipa. Script: `format_unannotated_jsonl.py`
|
| 60 |
+
|
| 61 |
+
If using *ANNOTATED* DATA, use the <turns>.json and the original data json to create a <parser>.jsonl
|
| 62 |
+
file formatted for Llamipa. Script: `format_annotated_jsonl.py`
|
| 63 |
+
|
| 64 |
+
**Make sure that the relation type representations in the `map_rels_str` dictionary in the `format_rels`
|
| 65 |
+
function match those in your data.
|
| 66 |
+
|
| 67 |
+
The DISTANCE variable is set to 15 edus, which is what was used for Llamipa training
|
| 68 |
+
and testing, but can be changed to support contexts of different lengths.
|
| 69 |
+
|
| 70 |
+
Note: If generating data for incremental parsing, make sure to add the space marker between
|
| 71 |
+
dialogues (line 109 in `format_unannotated_jsonl.py` and line 153 in `format_annotated_jsonl.py`). Otherwise, comment
|
| 72 |
+
this out if generating data for testing the parser with gold structure context.
|
| 73 |
+
|
| 74 |
+
|