From 87e9cc39e01d835eeef82f2ab03f833cbbd4667c Mon Sep 17 00:00:00 2001
From: James S
Date: Mon, 18 Nov 2024 23:25:18 +0000
Subject: [PATCH] Anonymize usernames again, and try including reply context

---
 data/process.js | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/data/process.js b/data/process.js
index 4461d5d..8244360 100644
--- a/data/process.js
+++ b/data/process.js
@@ -2,7 +2,7 @@
 const fs = require('node:fs');
 const JSONStream = require('JSONStream');
 const MIKU_FREQ = 5; // 1/5 = 20% of message chains are randomly chosen to be from Miku
-const DROPOUT_UNFUNNY = 0.75; // 75% dropout rate for messages chains which have NO reactions
+const DROPOUT_UNFUNNY = 0.75; // 75% dropout rate for message chains which have NO reactions
 const USERNAMES = [
     'vinny volcano\uD83C\uDF0B (伊胜焱)',
     'Server Comp!',
@@ -33,6 +33,7 @@ async function main() {
     let convoMsgSeqCount = 0;
     let convoReactCount = 0;
     let convoMsgs = [];
+    let convoRefs = {};
 
     stream.on('data', async (msg) => {
         // no bot/system messages
@@ -40,18 +41,6 @@
             return;
         }
 
-        /**
-         * Replies are a tricky case. I considered pasting their context in, except this
-         * requires keeping a full message cache and won't scale. Another possibility is
-         * to maintain a small sliding window of message history and delete replies which
-         * reference a message too far in the past... but what if that reply gets replied
-         * to right after? Our chat in particular has a lot of these "necro" messages, but
-         * since they tend to spark further discussion if anything, they probably don't
-         * noticeably obfuscate the flow of conversation compared to normal time skips,
-         * which our model is incapable of picking up in the first place.
-         * TLDR: Keep the replies. Making too many assumptions is bad.
-         */
-
         // scrub links
         let cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '');
         // scrub @mentions
@@ -62,6 +51,18 @@
             return;
         }
 
+        /**
+         * Handle replies by maintaining a sliding window of message references.
+         * If the replied-to message is too old to be part of this conversation, then leave this
+         * message alone.
+         * If it's recent, then embed it as context for this message, using the old-fashioned
+         * reply syntax: "> original message \n reply message"
+         */
+        if (msg.type === "Reply" && msg.reference.messageId in convoRefs) {
+            const repliedToContentLines = convoRefs[msg.reference.messageId].split('\n');
+            cleanContent = `> ${repliedToContentLines.join('\n> ')}\n${cleanContent}`;
+        }
+
         // count reaction
         convoReactCount += msg.reactions.length;
 
@@ -87,14 +88,16 @@
             }
             convoMsgSeqCount = convoReactCount = 0;
             convoMsgs = [];
+            convoRefs = {};
         }
 
         // follow chatML chat template
         const outMsg = {
-            role: botAuthoredMsgSequence ? "assistant" : msg.author.name,
+            role: botAuthoredMsgSequence ? "assistant" : "user",
             content: cleanContent
         };
         convoMsgs.push(outMsg);
+        convoRefs[msg.id] = cleanContent;
 
         if (++counter % 1000 === 0) {
             console.log(counter + " messages written");
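
For readers skimming the diff, the new reply handling is easier to follow in isolation. Below is a minimal sketch of that logic lifted out of the stream handler, assuming messages shaped like the fields the patch reads (id, type, content, reference.messageId); the embedReplyContext helper name and the sample messages are invented for illustration, not part of the patch.

const convoRefs = {};

function embedReplyContext(msg, cleanContent) {
    // Quote the parent only if it is still inside the current conversation
    // window; a reply to a message outside the window passes through unchanged.
    if (msg.type === "Reply" && msg.reference.messageId in convoRefs) {
        const repliedToContentLines = convoRefs[msg.reference.messageId].split('\n');
        cleanContent = `> ${repliedToContentLines.join('\n> ')}\n${cleanContent}`;
    }
    // Every processed message becomes quotable by later replies, until a
    // conversation boundary resets convoRefs (as the patch does with convoRefs = {}).
    convoRefs[msg.id] = cleanContent;
    return cleanContent;
}

// A two-line original, then a reply to it:
embedReplyContext({ id: "100", type: "Default" }, "we should rename the bot\nany ideas?");
console.log(embedReplyContext(
    { id: "101", type: "Reply", reference: { messageId: "100" } },
    "how about miku"
));
// Prints:
// > we should rename the bot
// > any ideas?
// how about miku

Note how a multi-line parent gets a "> " prefix on every line, mirroring the old-fashioned reply syntax the patch comment describes, so the model sees the quoted message inline instead of a dangling reference ID.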