const fs = require('node:fs');
// BUG FIX: `fsProm` was referenced below but never required; the bare catch
// silently swallowed the resulting ReferenceError, so a stale output.json from
// a previous run was never deleted and appends accumulated duplicates.
const fsProm = require('node:fs/promises');
const JSONStream = require('JSONStream');

// 1/MIKU_FREQ = 20% of message sequences are randomly attributed to Miku.
const MIKU_FREQ = 5;
// Number of message sequences bundled into one "conversation" (one JSONL row).
const SEQS_PER_CONVO = 10;
// A same-author gap of this many minutes or more starts a new sequence.
const SEQ_BREAK_MINUTES = 7;

/**
 * Streams a Discord-style JSON export ('messages.*' from input.json), filters
 * out bot/system messages, scrubs links, groups messages into author/time
 * sequences, and appends conversations of SEQS_PER_CONVO sequences to
 * output.json in JSONL format, each message shaped for a chatML template.
 */
async function main() {
  let counter = 0;

  // Start from a clean slate; output.json is append-only below.
  try {
    await fsProm.unlink('output.json');
  } catch {
    // File didn't exist — nothing to clean up.
  }

  const stream = fs.createReadStream('input.json').pipe(JSONStream.parse('messages.*'));

  let lastMsgAuthor;
  let lastMsgTime;
  let botAuthoredMsgSequence;
  let convoMsgSeqCount = 0;
  let convoMsgs = [];

  // Write the accumulated conversation as one JSONL record and reset state.
  const flushConvo = () => {
    fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
    convoMsgSeqCount = 0;
    convoMsgs = [];
  };

  // NOTE: handler is intentionally synchronous — appendFileSync keeps writes
  // ordered; an async handler's promise would be ignored by the emitter anyway.
  stream.on('data', (msg) => {
    // no bot/system messages
    if (msg.author.isBot || (msg.type !== "Default" && msg.type !== "Reply")) {
      return;
    }

    /**
     * Replies are a tricky case. I considered pasting their context in, except this
     * requires keeping a full message cache and won't scale. Another possibility is
     * to maintain a small sliding window of message history and delete replies which
     * reference a message too far in the past... but what if that reply gets replied
     * to right after? Our chat in particular has a lot of these "necro" messages, but
     * since they tend to spark further discussion if anything, they probably don't
     * noticeably obfuscate the flow of conversation compared to normal time skips,
     * which our model is incapable of picking up in the first place.
     * TLDR: Keep the replies. Making too many assumptions is bad.
     */

    // scrub links
    const cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '');
    if (!cleanContent) {
      return;
    }

    // determine continuity of message sequences
    const msgTime = new Date(msg.timestamp);
    const isNewSequence =
      lastMsgAuthor !== msg.author.id ||
      (msgTime - lastMsgTime) / 60000 >= SEQ_BREAK_MINUTES;
    if (isNewSequence) {
      // BUG FIX: flush BEFORE starting the new sequence. Previously the check
      // ran after ++convoMsgSeqCount, so a conversation was flushed when its
      // 10th sequence *started*, capturing only sequences 1-9 and pushing the
      // 10th sequence's opening message into the next (uncounted) conversation.
      if (convoMsgSeqCount === SEQS_PER_CONVO) {
        flushConvo();
      }
      lastMsgAuthor = msg.author.id;
      // bot will pretend to author a random number of msg sequences
      botAuthoredMsgSequence = Math.floor(Math.random() * MIKU_FREQ) === 0;
      ++convoMsgSeqCount;
    }
    lastMsgTime = msgTime;

    // follow chatML chat template
    convoMsgs.push({
      role: botAuthoredMsgSequence ? "assistant" : "user",
      content: cleanContent,
    });

    if (++counter % 1000 === 0) {
      console.log(counter + " messages written");
    }
  });

  // BUG FIX(review): 'end' is the documented completion event for readable
  // streams; the previous 'close' listener is not guaranteed to fire on a
  // JSONStream through-stream, which could drop the trailing partial
  // conversation — confirm against the JSONStream version in use.
  stream.on('end', () => {
    if (convoMsgs.length) {
      fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
    }
    console.log("Done!");
  });
}

// Surface failures instead of leaving main()'s promise floating.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});