const fs = require('node:fs');
const JSONStream = require('JSONStream');

// 1/5 = 20% of message chains are randomly chosen to be from Miku
const MIKU_FREQ = 5;
// 75% dropout rate for message chains which have NO reactions
const DROPOUT_UNFUNNY = 0.75;
// number of message sequences grouped into one "conversation" (one output line)
const SEQUENCES_PER_CONVO = 10;
// a pause of at least this many minutes starts a new sequence, even for the same author
const SEQUENCE_GAP_MINUTES = 7;
const INPUT_PATH = 'input.json';
const OUTPUT_PATH = 'output.json';
const USERNAMES = [
  'vinny volcano\uD83C\uDF0B (伊胜焱)',
  'Server Comp!',
  'Make The Map \uD83D\uDDFA',
  '1981 Celical Man\uD83C\uDF41\uD83C\uDF42',
  'Hatsune Miku',
  'Cutie Kazerounian\uD83C\uDF41\uD83C\uDF42',
  'David Pan (Fembooru)\uD83C\uDF41\uD83C\uDF42',
  'Exiled Sammy \uD83D\uDD12\uD83C\uDFDD⏱',
  'shibe.mp4❄☃',
  'Today Man-San(1990)\uD83C\uDF41\uD83C\uDF42',
  'owner',
  'cj7 by stephen chow (gmod PC)\uD83C\uDF41\uD83C\uDF42',
  'Nicolaid',
  'epbic',
];

/**
 * Strip http(s) links and `@username` mentions (for every entry of USERNAMES)
 * from a message body.
 *
 * @param {string} content - raw message text
 * @returns {string} the scrubbed text; may be empty
 */
function scrubContent(content) {
  // scrub links
  let cleaned = content.replaceAll(/https?:\/\/\S+/gi, '');
  // scrub @mentions
  for (const username of USERNAMES) {
    cleaned = cleaned.replaceAll(`@${username}`, '');
  }
  return cleaned;
}

/**
 * Embed a replied-to message as context, using the old-fashioned reply syntax:
 * "> original message \n reply message". Every line of the original is quoted.
 *
 * @param {string} original - text of the message being replied to
 * @param {string} reply - text of the replying message
 * @returns {string} the quoted original followed by the reply
 */
function quoteReply(original, reply) {
  return `> ${original.split('\n').join('\n> ')}\n${reply}`;
}

/**
 * Append one conversation to the output file as a single JSON line (JSONL).
 *
 * @param {Array<{role: string, content: string}>} messages - chat messages to write
 */
function appendConversation(messages) {
  fs.appendFileSync(OUTPUT_PATH, `${JSON.stringify(messages)}\n`);
}

/**
 * Stream the `messages` array of input.json and write conversations to
 * output.json, one JSON array of `{ role, content }` objects per line.
 *
 * Messages are grouped into "sequences" (consecutive messages by one author
 * with less than SEQUENCE_GAP_MINUTES between them); SEQUENCES_PER_CONVO
 * sequences form a conversation. A conversation without any reactions is
 * dropped with probability DROPOUT_UNFUNNY. Each sequence is randomly assigned
 * the "assistant" role with probability 1/MIKU_FREQ, otherwise "user".
 *
 * NOTE(review): the message fields read here (author.isBot, author.id, type,
 * content, reference.messageId, reactions, timestamp, id) presumably follow a
 * Discord chat-export schema — confirm against the tool producing input.json.
 *
 * @returns {Promise<void>} resolves once the parser stream has closed and the
 *   trailing conversation was written; rejects on read/parse errors or when
 *   the previous output file cannot be removed.
 */
async function main() {
  // Start from a clean output file: all writes below are appends.
  try {
    await fs.promises.unlink(OUTPUT_PATH);
  } catch (err) {
    // A missing file is expected on the first run; anything else is a real problem.
    if (err.code !== 'ENOENT') {
      throw err;
    }
  }

  const input = fs.createReadStream(INPUT_PATH);
  const stream = input.pipe(JSONStream.parse('messages.*'));

  let counter = 0;
  let lastMsgAuthor;
  let lastMsgTime;
  let botAuthoredMsgSequence;
  let convoMsgSeqCount = 0;
  let convoReactCount = 0;
  let convoMsgs = [];
  // message id -> cleaned content, for messages of the current conversation only
  let convoRefs = new Map();

  await new Promise((resolve, reject) => {
    // pipe() does not forward errors, so both ends need a handler
    input.on('error', reject);
    stream.on('error', reject);

    stream.on('data', (msg) => {
      // no bot/system messages
      if (msg.author.isBot || (msg.type !== 'Default' && msg.type !== 'Reply')) {
        return;
      }

      let cleanContent = scrubContent(msg.content);
      if (!cleanContent) {
        return;
      }

      // determine continuity of message sequences
      const msgTime = new Date(msg.timestamp);
      const startsNewSequence =
        lastMsgAuthor !== msg.author.id ||
        (msgTime - lastMsgTime) / 60000 >= SEQUENCE_GAP_MINUTES;
      if (startsNewSequence) {
        // Close the current conversation BEFORE this message is accounted for,
        // so its reactions and reply context belong to the conversation that
        // actually contains it.
        if (convoMsgSeqCount === SEQUENCES_PER_CONVO) {
          // dropout
          const convoKeep = convoReactCount > 0 || Math.random() >= DROPOUT_UNFUNNY;
          if (convoKeep) {
            appendConversation(convoMsgs);
          }
          convoMsgSeqCount = 0;
          convoReactCount = 0;
          convoMsgs = [];
          convoRefs = new Map();
        }
        lastMsgAuthor = msg.author.id;
        // bot will pretend to author a random number of msg sequences
        botAuthoredMsgSequence = Math.floor(Math.random() * MIKU_FREQ) === 0;
        convoMsgSeqCount += 1;
      }
      lastMsgTime = msgTime;

      /**
       * Handle replies by maintaining a sliding window of message references.
       * If the replied-to message is too old to be part of this conversation,
       * then leave this message alone. If it's recent, embed it as context.
       */
      if (msg.type === 'Reply') {
        // NOTE(review): `reference` is assumed to be present on replies; the
        // optional chain only avoids a crash if it is not.
        const repliedToId = msg.reference?.messageId;
        if (convoRefs.has(repliedToId)) {
          cleanContent = quoteReply(convoRefs.get(repliedToId), cleanContent);
        }
      }

      // count reactions
      convoReactCount += msg.reactions.length;

      // follow chatML chat template
      convoMsgs.push({
        role: botAuthoredMsgSequence ? 'assistant' : 'user',
        content: cleanContent,
      });
      convoRefs.set(msg.id, cleanContent);

      counter += 1;
      if (counter % 1000 === 0) {
        console.log(`${counter} messages written`);
      }
    });

    stream.on('close', () => {
      // The trailing (possibly partial) conversation is written without dropout.
      if (convoMsgs.length) {
        appendConversation(convoMsgs);
      }
      console.log('Done!');
      resolve();
    });
  });
}

main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});