// MikuAI/data/process.js

const fs = require('node:fs');
const JSONStream = require('JSONStream');

const MIKU_FREQ = 5;   // each message sequence has a 1-in-MIKU_FREQ chance (1/5 = 20%) of being attributed to Miku (the "assistant" role)

/**
 * Converts the chat export in `input.json` into JSONL conversations in `output.json`.
 *
 * Every element of the top-level `messages` array is streamed through the
 * following pipeline:
 *   1. bot messages and messages whose type is not "Default"/"Reply" are dropped;
 *   2. http(s) links are scrubbed, and messages left empty are dropped;
 *   3. consecutive messages by the same author less than 7 minutes apart are
 *      grouped into one "sequence"; each sequence is randomly attributed to the
 *      assistant with probability 1/MIKU_FREQ, otherwise to the user;
 *   4. every 10 sequences form one conversation, written as a single line
 *      containing a JSON array of `{ role, content }` objects.
 *
 * Any existing `output.json` is removed first so repeated runs do not append to
 * stale data.
 *
 * @returns {Promise<void>} Resolves once the parser stream has closed and the
 *     trailing partial conversation has been flushed; rejects if the output file
 *     cannot be removed or if the read/parse stream emits an error.
 */
async function main() {
    const INPUT_PATH = 'input.json';
    const OUTPUT_PATH = 'output.json';
    const SEQS_PER_CONVO = 10;
    const MAX_SEQ_GAP_MINUTES = 7;
    // Hoisted so the regex is not rebuilt for every message.
    const LINK_PATTERN = /https?:\/\/\S+/gi;

    // Start from a clean output file; only "file does not exist" is expected here.
    try {
        await fs.promises.unlink(OUTPUT_PATH);
    } catch (err) {
        if (err.code !== 'ENOENT') {
            throw err;
        }
    }

    let counter = 0;
    let lastMsgAuthor;
    let lastMsgTime;
    let botAuthoredMsgSequence;
    let convoMsgSeqCount = 0;
    let convoMsgs = [];

    // Appends the buffered conversation as one JSONL line and resets the buffer.
    const flushConvo = () => {
        if (convoMsgs.length === 0) {
            return;
        }
        fs.appendFileSync(OUTPUT_PATH, `${JSON.stringify(convoMsgs)}\n`);
        convoMsgs = [];
        convoMsgSeqCount = 0;
    };

    // Filters, cleans and buffers a single parsed message.
    const handleMessage = (msg) => {
        // no bot/system messages
        if (msg.author.isBot || (msg.type !== "Default" && msg.type !== "Reply")) {
            return;
        }

        /**
         * Replies are a tricky case. I considered pasting their context in, except this
         * requires keeping a full message cache and won't scale. Another possibility is
         * to maintain a small sliding window of message history and delete replies which
         * reference a message too far in the past... but what if that reply gets replied
         * to right after? Our chat in particular has a lot of these "necro" messages, but
         * since they tend to spark further discussion if anything, they probably don't
         * noticeably obfuscate the flow of conversation compared to normal time skips,
         * which our model is incapable of picking up in the first place.
         * TLDR: Keep the replies. Making too many assumptions is bad.
         */

        // scrub links; trim so a link-only message does not survive as bare whitespace
        const cleanContent = msg.content.replaceAll(LINK_PATTERN, '').trim();
        if (!cleanContent) {
            return;
        }

        // determine continuity of message sequences
        // (on the very first message lastMsgTime is undefined -> NaN gap, but the
        // author comparison already starts a new sequence)
        const msgTime = new Date(msg.timestamp);
        const startsNewSequence = lastMsgAuthor !== msg.author.id
            || (msgTime - lastMsgTime) / 60000 >= MAX_SEQ_GAP_MINUTES;
        if (startsNewSequence) {
            // The buffered conversation already holds its full quota of sequences,
            // so write it out before this message opens the next conversation.
            if (convoMsgSeqCount === SEQS_PER_CONVO) {
                flushConvo();
            }

            lastMsgAuthor = msg.author.id;

            // bot will pretend to author a random number of msg sequences
            botAuthoredMsgSequence = Math.floor(Math.random() * MIKU_FREQ) === 0;

            ++convoMsgSeqCount;
        }
        lastMsgTime = msgTime;

        // follow chatML chat template
        convoMsgs.push({
            role: botAuthoredMsgSequence ? "assistant" : "user",
            content: cleanContent
        });

        if (++counter % 1000 === 0) {
            console.log(`${counter} messages written`);
        }
    };

    const source = fs.createReadStream(INPUT_PATH);
    const stream = source.pipe(JSONStream.parse('messages.*'));

    return new Promise((resolve, reject) => {
        // pipe() does not forward errors, so both ends need their own listener.
        source.on('error', reject);
        stream.on('error', reject);

        stream.on('data', handleMessage);

        stream.on('close', () => {
            // write whatever is left of the last (possibly partial) conversation
            flushConvo();
            console.log("Done!");
            resolve();
        });
    });
}

// Run the script; report any rejection and exit non-zero instead of leaving the promise floating.
main().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});