const fs = require('node:fs');
const JSONStream = require('JSONStream');

const MIKU_FREQ = 5;  // 1/5 = 20% of message chains are randomly chosen to be from Miku
const DROPOUT_UNFUNNY = 0.75;  // 75% dropout rate for message chains which have NO reactions
const SEQ_GAP_MINUTES = 7;  // a pause at least this long starts a new message sequence, even for the same author
const SEQS_PER_PROMPT = 10;  // number of message sequences written to one line of the dataset
const INPUT_PATH = 'input.json';
const OUTPUT_PATH = 'output.json';

const SAVED_USERNAMES = [  // usernames as they were recorded in the chat log (@nickname)
    'vinny volcano\uD83C\uDF0B (伊胜焱)',
    'Server Comp!',
    'Make The Map \uD83D\uDDFA',
    '1981 Celical Man\uD83C\uDF41\uD83C\uDF42',
    'Hatsune Miku',
    'Cutie Kazerounian\uD83C\uDF41\uD83C\uDF42',
    'David Pan (Fembooru)\uD83C\uDF41\uD83C\uDF42',
    'Exiled Sammy \uD83D\uDD12\uD83C\uDFDD⏱',
    'shibe.mp4❄☃',
    'Today Man-San(1990)\uD83C\uDF41\uD83C\uDF42',
    'owner',
    'cj7 by stephen chow (gmod PC)\uD83C\uDF41\uD83C\uDF42',
    'Nicolaid',
    'epbic',
    'Cap’n Vincent 🏴☠🏝',
    '1715 Galleonpilled Skipchud ⚓🦜',
    'me gold doubloons🏴☠🏆',
    'Boatswain Samuel ⚓⛵ 🌊'
];

const REAL_NAMES = {  // username to real name mapping
    'vinso1445': 'Vincent Iannelli',
    'scoliono': 'James Shiffer',
    'gnuwu': 'David Zheng',
    'f0oby': 'Myles Linden',
    'bapazheng': 'Myles Linden',
    'bapabakshi': 'Myles Linden',
    'keliande27': 'Myles Linden',
    '1thinker': 'Samuel Habib',
    'adam28405': 'Adam Kazerounian',
    'shibe.mp4': 'Jake Wong'
};

/**
 * Streams the chat export at `input.json`, groups consecutive messages into
 * "message sequences", randomly assigns some sequences to the bot persona, and
 * appends one JSON line per group of sequences to `output.json`.
 *
 * @returns {Promise<void>} resolves once the read stream has been set up; the
 *   actual conversion happens in the stream's event handlers.
 */
async function main() {
    let counter = 0;

    // Start from a clean output file. A missing file is fine; any other failure
    // (permissions, etc.) would corrupt the dataset silently, so let it surface.
    try {
        await fs.promises.unlink(OUTPUT_PATH);
    } catch (err) {
        if (err.code !== 'ENOENT') {
            throw err;
        }
    }

    const stream = fs.createReadStream(INPUT_PATH).pipe(JSONStream.parse('messages.*'));

    let lastMsgAuthor;
    let lastMsgTime;
    let botAuthoredMsgSequence;
    let convoReactCount = 0;
    let promptMsg = [];  // group of formatted msg seqs to be written to one line of the final dataset
    let discordMsgs = [];  // a single formatted message sequence
    // message id -> cleaned content, for the messages seen in the current group
    const convoRefs = new Map();

    /**
     * Langchain structured output
     *
     * Beneath a few layers of abstraction, the finetuned model is ultimately prompted like so:
     *
     * ```
     * USER:
     * Answer the user query.
     * [ Langchain JSON structured output instructions ]
     * { ... "author": "vinso1445", "content": "message history 1" ... }
     * { ... "author": "f0oby", "content": "message history 2" ... }
     * { ... "author": "scoliono", "content": "message history 3" ... }
     *
     *
     * ASSISTANT:
     * { ... "author": "Hatsune Miku", "content": "message history 1" ... }
     * ```
     *
     * To this end, we have a function to format Discord messages in the same format as the
     * Pydantic object seen by Langchain. (The Langchain-specific instructions are not included.)
     *
     * Each turn by the user or assistant in the LLM-level conversation is henceforth known as a
     * "prompt message". The individual JSON lines in this example are supposed to represent
     * Discord messages, with one prompt message containing a "message sequence"'s worth. In the
     * actual JSONL dataset, though, one line represents 10 message sequences.
     *
     * Note: the training data will sometimes have multiple Discord messages in a single assistant
     * message sequence. Although it may seem unorthodox to have an LLM double-text you, this is
     * supposed to emulate a real Discord conversation, and real people have a tendency to split up
     * a thought across multiple messages. It's up to the inference code to decide what to do with
     * this.
     *
     * @param {object} msg - a message object from the chat export
     * @param {string} cleanContent - the message text after links and @mentions were scrubbed
     * @param {boolean} [isBotMessage=false] - whether to attribute the message to the bot persona
     * @returns {string} the message serialized as a single JSON object
     */
    function structurePrompt(msg, cleanContent, isBotMessage = false) {
        /**
         * Handle replies by maintaining a sliding window of message references.
         * If the replied-to message is too old to be part of this conversation, then leave this
         * message alone. If it's recent, then embed it as context for this message.
         */
        let repliedToContent;
        if (msg.type === "Reply") {
            // `reference` may be absent on a reply; treat that like an unknown target
            repliedToContent = convoRefs.get(msg.reference?.messageId);
        }

        // record reactions the message got in a compact string form
        const reactionString = msg.reactions.length > 0
            ? msg.reactions
                .map((reaction) => `:${reaction.emoji.code}: (${reaction.count})`)
                .join(', ')
            : undefined;

        // 'name', 'context', 'reactions' could be undefined, in which case those fields are omitted
        return JSON.stringify({
            timestamp: (new Date(msg.timestamp)).toUTCString(),
            author: isBotMessage ? 'Hatsune Miku#1740' : msg.author.name,
            name: isBotMessage ? 'Hatsune Miku' : REAL_NAMES[msg.author.name],
            context: repliedToContent,
            content: cleanContent,
            reactions: reactionString
        });
    }

    /**
     * Moves the message sequence buffered in `discordMsgs` into `promptMsg` as one
     * chat turn. Does nothing when the buffer is empty, so no blank turns are emitted.
     */
    function flushSequence() {
        if (discordMsgs.length === 0) {
            return;
        }
        // follow chatML chat template when writing to prompt
        promptMsg.push({
            role: botAuthoredMsgSequence ? 'assistant' : 'user',
            content: discordMsgs.join('\n')
        });
        discordMsgs = [];
    }

    /** Appends the current group of prompt messages to the output file as one JSONL line. */
    function writePromptLine() {
        fs.appendFileSync(OUTPUT_PATH, JSON.stringify(promptMsg) + '\n');
    }

    stream.on('data', (msg) => {
        // no bot/system messages
        if (msg.author.isBot || (msg.type !== "Default" && msg.type !== "Reply")) {
            return;
        }

        // scrub links
        let cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '');
        // scrub @mentions
        for (const username of SAVED_USERNAMES) {
            cleanContent = cleanContent.replaceAll(`@${username}`, "");
        }
        if (!cleanContent) {
            return;
        }

        // determine continuity of message sequences
        const msgTime = new Date(msg.timestamp);
        const startsNewSequence = lastMsgAuthor !== msg.author.id
            || (msgTime - lastMsgTime) / 60000 >= SEQ_GAP_MINUTES;
        if (startsNewSequence) {
            lastMsgAuthor = msg.author.id;
            // close out the previous sequence (if any) before starting this one
            flushSequence();
            // bot will pretend to author a random number of msg sequences
            botAuthoredMsgSequence = Math.floor(Math.random() * MIKU_FREQ) === 0;
        }
        lastMsgTime = msgTime;

        // a full group of msg sequences becomes one line of the dataset
        if (promptMsg.length === SEQS_PER_PROMPT) {
            // dropout
            const convoKeep = convoReactCount > 0 || Math.random() >= DROPOUT_UNFUNNY;
            if (convoKeep) {
                // write JSONL format
                writePromptLine();
            }
            convoReactCount = 0;
            promptMsg = [];
            convoRefs.clear();
        }

        // count reactions only after the group boundary check, so this message's
        // reactions are credited to the group it is actually written into
        convoReactCount += msg.reactions.length;
        convoRefs.set(msg.id, cleanContent);

        // write a single discord message to the prompt
        discordMsgs.push(structurePrompt(msg, cleanContent, botAuthoredMsgSequence));

        if (++counter % 1000 === 0) {
            console.log(counter + " messages written");
        }
    });

    stream.on('close', () => {
        // include the sequence still being buffered; otherwise the tail of the log is lost
        flushSequence();
        if (promptMsg.length) {
            writePromptLine();
        }
        console.log("Done!");
    });
}

main().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});