From 87e9cc39e01d835eeef82f2ab03f833cbbd4667c Mon Sep 17 00:00:00 2001
From: James S
Date: Mon, 18 Nov 2024 23:25:18 +0000
Subject: [PATCH] Anonymize usernames again, and try including reply context

---
 data/process.js | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/data/process.js b/data/process.js
index 4461d5d..8244360 100644
--- a/data/process.js
+++ b/data/process.js
@@ -2,7 +2,7 @@
 const fs = require('node:fs');
 const JSONStream = require('JSONStream');
 const MIKU_FREQ = 5; // 1/5 = 20% of message chains are randomly chosen to be from Miku
-const DROPOUT_UNFUNNY = 0.75; // 75% dropout rate for messages chains which have NO reactions
+const DROPOUT_UNFUNNY = 0.75; // 75% dropout rate for message chains which have NO reactions
 const USERNAMES = [
     'vinny volcano\uD83C\uDF0B (伊胜焱)',
     'Server Comp!',
@@ -33,6 +33,7 @@ async function main() {
     let convoMsgSeqCount = 0;
     let convoReactCount = 0;
     let convoMsgs = [];
+    let convoRefs = {};
 
     stream.on('data', async (msg) => {
         // no bot/system messages
@@ -40,18 +41,6 @@
             return;
         }
 
-        /**
-         * Replies are a tricky case. I considered pasting their context in, except this
-         * requires keeping a full message cache and won't scale. Another possibility is
-         * to maintain a small sliding window of message history and delete replies which
-         * reference a message too far in the past... but what if that reply gets replied
-         * to right after? Our chat in particular has a lot of these "necro" messages, but
-         * since they tend to spark further discussion if anything, they probably don't
-         * noticeably obfuscate the flow of conversation compared to normal time skips,
-         * which our model is incapable of picking up in the first place.
-         * TLDR: Keep the replies. Making too many assumptions is bad.
-         */
-
         // scrub links
         let cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '');
         // scrub @mentions
@@ -62,6 +51,18 @@
             return;
         }
 
+        /**
+         * Handle replies by maintaining a sliding window of message references.
+         * If the replied-to message is too old to be part of this conversation, then leave this
+         * message alone.
+         * If it's recent, then embed it as context for this message, using the old-fashioned
+         * reply syntax: "> original message \n reply message"
+         */
+        if (msg.type === "Reply" && msg.reference.messageId in convoRefs) {
+            const repliedToContentLines = convoRefs[msg.reference.messageId].split('\n');
+            cleanContent = `> ${repliedToContentLines.join('\n> ')}\n${cleanContent}`;
+        }
+
         // count reaction
         convoReactCount += msg.reactions.length;
 
@@ -87,14 +88,16 @@
             }
             convoMsgSeqCount = convoReactCount = 0;
             convoMsgs = [];
+            convoRefs = {};
         }
 
         // follow chatML chat template
         const outMsg = {
-            role: botAuthoredMsgSequence ? "assistant" : msg.author.name,
+            role: botAuthoredMsgSequence ? "assistant" : "user",
             content: cleanContent
         };
         convoMsgs.push(outMsg);
+        convoRefs[msg.id] = cleanContent;
 
         if (++counter % 1000 === 0) {
             console.log(counter + " messages written");
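
For readers skimming the diff, the new reply handling is easier to follow in isolation. Below is a minimal sketch of that logic lifted out of the stream handler, assuming messages shaped like the fields the patch reads (id, type, content, reference.messageId); the embedReplyContext helper name and the sample messages are invented for illustration, not part of the patch.

const convoRefs = {};

function embedReplyContext(msg, cleanContent) {
    // Quote the parent only if it is still inside the current conversation
    // window; a reply to a message outside the window passes through unchanged.
    if (msg.type === "Reply" && msg.reference.messageId in convoRefs) {
        const repliedToContentLines = convoRefs[msg.reference.messageId].split('\n');
        cleanContent = `> ${repliedToContentLines.join('\n> ')}\n${cleanContent}`;
    }
    // Every processed message becomes quotable by later replies, until a
    // conversation boundary resets convoRefs (as the patch does with convoRefs = {}).
    convoRefs[msg.id] = cleanContent;
    return cleanContent;
}

// A two-line original, then a reply to it:
embedReplyContext({ id: "100", type: "Default" }, "we should rename the bot\nany ideas?");
console.log(embedReplyContext(
    { id: "101", type: "Reply", reference: { messageId: "100" } },
    "how about miku"
));
// Prints:
// > we should rename the bot
// > any ideas?
// how about miku

Note how a multi-line parent gets a "> " prefix on every line, mirroring the old-fashioned reply syntax the patch comment describes, so the model sees the quoted message inline instead of a dangling reference ID.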