diff --git a/data/procToxicQA.js b/data/procToxicQA.js
index 7ecc75e..dec788d 100644
--- a/data/procToxicQA.js
+++ b/data/procToxicQA.js
@@ -1,9 +1,17 @@
+/**
+ * procToxicQA.js
+ * This script assumes you have ToxicQA (https://huggingface.co/datasets/NobodyExistsOnTheInternet/toxicqa/blob/main/finalToxicQA.jsonl)
+ * downloaded to 'toxicQA.json'.
+ */
+
 const fs = require('node:fs');
 var lineReader = require('readline').createInterface({
     input: fs.createReadStream('toxicQA.json')
 });
 var outstream = fs.createWriteStream('toxicQAfinal.json');
-fs.unlinkSync('toxicQAfinal.json');
+if (fs.existsSync('toxicQAfinal.json')) {
+    fs.unlinkSync('toxicQAfinal.json');
+}
 
 lineReader.on('line', function (line) {
     const dialogue = JSON.parse(line)["conversations"];
diff --git a/data/process.js b/data/process.js
index 8244360..9ec52cc 100644
--- a/data/process.js
+++ b/data/process.js
@@ -3,7 +3,7 @@ const JSONStream = require('JSONStream');
 
 const MIKU_FREQ = 5; // 1/5 = 20% of message chains are randomly chosen to be from Miku
 const DROPOUT_UNFUNNY = 0.75; // 75% dropout rate for message chains which have NO reactions
-const USERNAMES = [
+const SAVED_USERNAMES = [ // usernames as they were recorded in the chat log (@nickname)
     'vinny volcano\uD83C\uDF0B (伊胜焱)',
     'Server Comp!',
     'Make The Map \uD83D\uDDFA',
@@ -18,7 +18,23 @@
     'cj7 by stephen chow (gmod PC)\uD83C\uDF41\uD83C\uDF42',
     'Nicolaid',
     'epbic',
+    'Cap’n Vincent 🏴☠🏝',
+    '1715 Galleonpilled Skipchud ⚓🦜',
+    'me gold doubloons🏴☠🏆',
+    'Boatswain Samuel ⚓⛵ 🌊'
 ];
+const REAL_NAMES = { // username to real name mapping
+    'vinso1445': 'Vincent Iannelli',
+    'scoliono': 'James Shiffer',
+    'gnuwu': 'David Zheng',
+    'f0oby': 'Myles Linden',
+    'bapazheng': 'Myles Linden',
+    'bapabakshi': 'Myles Linden',
+    'keliande27': 'Myles Linden',
+    '1thinker': 'Samuel Habib',
+    'adam28405': 'Adam Kazerounian',
+    'shibe.mp4': 'Jake Wong'
+};
 
 async function main() {
     let counter = 0;
@@ -32,9 +48,77 @@
     let botAuthoredMsgSequence;
     let convoMsgSeqCount = 0;
     let convoReactCount = 0;
-    let convoMsgs = [];
+    let promptMsg = []; // group of formatted msg seqs to be written to one line of the final dataset
+    let discordMsgs = []; // a single formatted message sequence
     let convoRefs = {};
 
+    /**
+     * Langchain structured output
+     *
+     * Beneath a few layers of abstraction, the finetuned model is ultimately prompted like so:
+     *
+     * ```
+     * USER:
+     * Answer the user query.
+     * [ Langchain JSON structured output instructions ]
+     * { ... "author": "vinso", "content": "message history 1" ... }
+     * { ... "author": "f0oby", "content": "message history 2" ... }
+     * { ... "author": "scoliono", "content": "message history 3" ... }
+     *
+     *
+     * ASSISTANT:
+     * { ... "author": "Hatsune Miku", "content": "message history 1" ... }
+     * ```
+     *
+     * To this end, we have a function to format Discord messages in the same format as the
+     * Pydantic object seen by Langchain. (The Langchain-specific instructions are not included.)
+     *
+     * Each turn by the user or assistant in the LLM-level conversation is henceforth known as a
+     * "prompt message". The individual JSON lines in this example are supposed to represent
+     * Discord messages, with one prompt message containing a "message sequence"'s worth. In the
+     * actual JSONL dataset, though, one line represents 10 message sequences.
+     *
+     * Note: the training data will sometimes have multiple Discord messages in a single
+     * assistant message sequence. Although it may seem unorthodox to have an LLM double-text
+     * you, this is supposed to emulate a real Discord conversation, and real people have a
+     * tendency to split up a thought across multiple messages. It's up to the inference code
+     * to decide what to do with this.
+     */
+    function structurePrompt(msg, cleanContent) {
+        /**
+         * Handle replies by maintaining a sliding window of message references.
+         * If the replied-to message is too old to be part of this conversation, then leave
+         * this message alone. If it's recent, then embed it as context for this message.
+         */
+        let repliedToContent;
+        if (msg.type === "Reply" && msg.reference.messageId in convoRefs) {
+            repliedToContent = convoRefs[msg.reference.messageId];
+        }
+
+        // record reactions the message got in a compact string form
+        let reactionString;
+        for (const reaction of msg.reactions) {
+            if (reactionString === undefined) {
+                reactionString = '';
+            }
+            if (reactionString && reactionString.length > 0) {
+                reactionString += ', ';
+            }
+            reactionString += `:${reaction.emoji.code}: (${reaction.count})`;
+        }
+
+        // 'name', 'context', 'reactions' could be undefined, in which case those fields are omitted
+        return JSON.stringify({
+            timestamp: (new Date(msg.timestamp)).toUTCString(),
+            author: msg.author.name,
+            name: REAL_NAMES[msg.author.name],
+            context: repliedToContent,
+            content: cleanContent,
+            reactions: reactionString
+        });
+    }
+
     stream.on('data', async (msg) => {
         // no bot/system messages
         if (msg.author.isBot || (msg.type !== "Default" && msg.type !== "Reply")) {
@@ -44,25 +128,13 @@
         // scrub links
         let cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '');
         // scrub @mentions
-        for (const username of USERNAMES) {
+        for (const username of SAVED_USERNAMES) {
            cleanContent = cleanContent.replaceAll(`@${username}`, "");
         }
         if (!cleanContent) {
             return;
         }
 
-        /**
-         * Handle replies by maintaining a sliding window of message references.
-         * If the replied-to message is too old to be part of this conversation, then leave this
-         * message alone.
-         * If it's recent, then embed it as context for this message, using the old-fashioned
-         * reply syntax: "> original message \n reply message"
-         */
-        if (msg.type === "Reply" && msg.reference.messageId in convoRefs) {
-            const repliedToContentLines = convoRefs[msg.reference.messageId].split('\n');
-            cleanContent = `> ${repliedToContentLines.join('\n> ')}\n${cleanContent}`;
-        }
-
         // count reaction
         convoReactCount += msg.reactions.length;
 
@@ -71,6 +143,14 @@
         if (lastMsgAuthor !== msg.author.id || (msgTime - lastMsgTime)/60000 >= 7) {
             lastMsgAuthor = msg.author.id;
 
+            // follow chatML chat template when writing to prompt
+            promptMsg.push({
+                role: botAuthoredMsgSequence ? 'assistant' : 'user',
+                content: discordMsgs.join('\n')
+            });
+
+            discordMsgs = [];
+
             // bot will pretend to author a random number of msg sequences
             botAuthoredMsgSequence = Math.floor(Math.random() * MIKU_FREQ) === 0;
 
@@ -78,35 +158,34 @@
         }
         lastMsgTime = msgTime;
 
-        // 10 msg sequences per "conversation"
+        // 10 msg sequences per prompt message
         if (convoMsgSeqCount === 10) {
             // dropout
             const convoKeep = convoReactCount > 0 || Math.random() >= DROPOUT_UNFUNNY;
             if (convoKeep) {
                 // write JSONL format
-                fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
+                fs.appendFileSync('output.json', JSON.stringify(promptMsg) + '\n');
             }
             convoMsgSeqCount = convoReactCount = 0;
-            convoMsgs = [];
+            promptMsg = [];
+            discordMsgs = [];
             convoRefs = {};
         }
 
-        // follow chatML chat template
-        const outMsg = {
-            role: botAuthoredMsgSequence ? "assistant" : "user",
-            content: cleanContent
-        };
-        convoMsgs.push(outMsg);
         convoRefs[msg.id] = cleanContent;
 
+        // write a single discord message to the prompt
+        discordMsgs.push(structurePrompt(msg, cleanContent));
+
         if (++counter % 1000 === 0) {
             console.log(counter + " messages written");
         }
     });
+
     stream.on('close', async () => {
-        if (convoMsgs.length) {
-            fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
+        if (promptMsg.length) {
+            fs.appendFileSync('output.json', JSON.stringify(promptMsg) + '\n');
         }
         console.log("Done!");
     });
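
For reference, a minimal sketch of what `structurePrompt` emits for a single message. The object shape mirrors only the fields the function reads (`type`, `timestamp`, `author.name`, `reactions[].emoji.code`, `reactions[].count`); the sample message and values are hypothetical, apart from `vinso1445`, which appears in `REAL_NAMES` above:

```js
// Hypothetical message in the shape consumed by structurePrompt
// (only the fields the function actually reads are included).
const sampleMsg = {
    type: "Default", // not a "Reply", so the 'context' field is omitted
    timestamp: "2024-03-01T12:34:56.000Z",
    author: { name: "vinso1445" },
    reactions: [{ emoji: { code: "kekw" }, count: 3 }]
};

// structurePrompt(sampleMsg, "hello world") then returns the JSON line:
// {"timestamp":"Fri, 01 Mar 2024 12:34:56 GMT","author":"vinso1445","name":"Vincent Iannelli","content":"hello world","reactions":":kekw: (3)"}
```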
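Putting the two together, each line of `output.json` is `JSON.stringify(promptMsg)`: a JSON array of up to 10 ChatML-style turns whose `content` is a newline-joined run of the structured lines above. An abridged, hypothetical line might look like the following (a real line carries 10 message sequences, and the `...` placeholders elide the remaining fields and turns):

```js
// One line of output.json, abridged:
// [
//   {"role":"user","content":"{\"timestamp\":...,\"author\":\"vinso1445\",...}\n{\"timestamp\":...,\"author\":\"scoliono\",...}"},
//   {"role":"assistant","content":"{\"timestamp\":...,\"author\":\"f0oby\",\"name\":\"Myles Linden\",...}"},
//   ...
// ]
```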