195 lines
7.0 KiB
JavaScript
195 lines
7.0 KiB
JavaScript
const fs = require('node:fs');
|
||
const JSONStream = require('JSONStream');
|
||
|
||
const MIKU_FREQ = 5; // 1/5 = 20% of message chains are randomly chosen to be from Miku
const DROPOUT_UNFUNNY = 0.75; // 75% dropout rate for message chains which have NO reactions

// Usernames as they were recorded in the chat log (@nickname).
// Frozen so the shared constant cannot be mutated by accident.
const SAVED_USERNAMES = Object.freeze([
    'vinny volcano\uD83C\uDF0B (伊胜焱)',
    'Server Comp!',
    'Make The Map \uD83D\uDDFA',
    '1981 Celical Man\uD83C\uDF41\uD83C\uDF42',
    'Hatsune Miku',
    'Cutie Kazerounian\uD83C\uDF41\uD83C\uDF42',
    'David Pan (Fembooru)\uD83C\uDF41\uD83C\uDF42',
    'Exiled Sammy \uD83D\uDD12\uD83C\uDFDD⏱',
    'shibe.mp4❄☃',
    'Today Man-San(1990)\uD83C\uDF41\uD83C\uDF42',
    'owner',
    'cj7 by stephen chow (gmod PC)\uD83C\uDF41\uD83C\uDF42',
    'Nicolaid',
    'epbic',
    'Cap’n Vincent 🏴☠🏝',
    '1715 Galleonpilled Skipchud ⚓🦜',
    'me gold doubloons🏴☠🏆',
    'Boatswain Samuel ⚓⛵ 🌊'
]);

// Username -> real name mapping (several aliases map to the same person).
// Frozen for the same reason as SAVED_USERNAMES.
const REAL_NAMES = Object.freeze({
    'vinso1445': 'Vincent Iannelli',
    'scoliono': 'James Shiffer',
    'gnuwu': 'David Zheng',
    'f0oby': 'Myles Linden',
    'bapazheng': 'Myles Linden',
    'bapabakshi': 'Myles Linden',
    'keliande27': 'Myles Linden',
    '1thinker': 'Samuel Habib',
    'adam28405': 'Adam Kazerounian',
    'shibe.mp4': 'Jake Wong'
});
|
||
|
||
/**
 * Streams a DiscordChatExporter dump ('input.json') and emits a chatML-style
 * JSONL finetuning dataset ('output.json').
 *
 * Messages are grouped into "message sequences": consecutive messages by one
 * author, each within 7 minutes of the previous one. Ten sequences form one
 * prompt line. Roughly 1/MIKU_FREQ of sequences are attributed to the bot
 * persona ('assistant' role); the rest are 'user'. Prompt lines with no
 * reactions are randomly dropped at rate DROPOUT_UNFUNNY.
 */
async function main() {
    let counter = 0;

    // Start from a clean output file; ignore the error if it doesn't exist.
    // FIX: the original called the undefined `fsProm`, and the resulting
    // ReferenceError was silently swallowed by the empty catch, so a stale
    // output.json was never removed and reruns appended duplicate lines.
    try {
        await fs.promises.unlink('output.json');
    } catch {}

    const stream = fs.createReadStream('input.json').pipe(JSONStream.parse('messages.*'));
    let lastMsgAuthor;           // author id of the previously processed message
    let lastMsgTime;             // Date of the previously processed message
    let botAuthoredMsgSequence;  // is the current sequence attributed to the bot?
    let convoMsgSeqCount = 0;    // sequences flushed into the current prompt line
    let convoReactCount = 0;     // reactions seen in the current prompt line
    let promptMsg = [];          // group of formatted msg seqs to be written to one line of the final dataset
    let discordMsgs = [];        // a single formatted message sequence
    let convoRefs = {};          // sliding window: message id -> clean content, for replies

    /**
     * Langchain structured output.
     *
     * Beneath a few layers of abstraction, the finetuned model is ultimately
     * prompted with a USER turn holding JSON-line message history and an
     * ASSISTANT turn holding the bot's JSON-line reply. This function formats
     * one Discord message in the same shape as the Pydantic object seen by
     * Langchain (the Langchain-specific instructions are not included).
     *
     * Each user/assistant turn is a "prompt message" containing one message
     * sequence; in the actual JSONL dataset, one line holds 10 sequences.
     *
     * Note: the training data will sometimes have multiple Discord messages in
     * a single assistant turn. Although it may seem unorthodox for an LLM to
     * double-text you, real people split thoughts across messages; it's up to
     * the inference code to decide what to do with this.
     */
    function structurePrompt(msg, cleanContent) {
        // Handle replies via the sliding window of message references. If the
        // replied-to message is too old to be part of this conversation, leave
        // this message alone; if it's recent, embed it as context.
        // FIX: guard msg.reference before dereferencing, in case a Reply
        // record arrives without one.
        let repliedToContent;
        if (msg.type === "Reply" && msg.reference && msg.reference.messageId in convoRefs) {
            repliedToContent = convoRefs[msg.reference.messageId];
        }

        // Record reactions in a compact string form, e.g. ":kek: (3), :sob: (1)".
        // Stays undefined (and is thus omitted by JSON.stringify) when there
        // are none, matching the original accumulate-in-a-loop behavior.
        let reactionString;
        if (msg.reactions.length > 0) {
            reactionString = msg.reactions
                .map((r) => `:${r.emoji.code}: (${r.count})`)
                .join(', ');
        }

        // 'name', 'context', 'reactions' could be undefined, in which case
        // those fields are omitted from the serialized object.
        return JSON.stringify({
            timestamp: (new Date(msg.timestamp)).toUTCString(),
            author: msg.author.name,
            name: REAL_NAMES[msg.author.name],
            context: repliedToContent,
            content: cleanContent,
            reactions: reactionString
        });
    }

    stream.on('data', async (msg) => {
        // no bot/system messages
        if (msg.author.isBot || (msg.type !== "Default" && msg.type !== "Reply")) {
            return;
        }

        // scrub links
        let cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '');
        // scrub @mentions
        for (const username of SAVED_USERNAMES) {
            cleanContent = cleanContent.replaceAll(`@${username}`, "");
        }
        if (!cleanContent) {
            return;
        }

        // count reactions toward the dropout decision
        convoReactCount += msg.reactions.length;

        // A new message sequence starts when the author changes or >= 7
        // minutes have passed since the previous message.
        const msgTime = new Date(msg.timestamp);
        if (lastMsgAuthor !== msg.author.id || (msgTime - lastMsgTime) / 60000 >= 7) {
            lastMsgAuthor = msg.author.id;

            // Flush the finished sequence, following the chatML chat template.
            // FIX: skip the very first boundary (nothing accumulated yet) --
            // the original pushed an empty-content turn here and counted it
            // toward the 10-sequence quota.
            if (discordMsgs.length > 0) {
                promptMsg.push({
                    role: botAuthoredMsgSequence ? 'assistant' : 'user',
                    content: discordMsgs.join('\n')
                });
                ++convoMsgSeqCount;
            }
            discordMsgs = [];

            // bot will pretend to author a random number of msg sequences
            botAuthoredMsgSequence = Math.floor(Math.random() * MIKU_FREQ) === 0;
        }
        lastMsgTime = msgTime;

        // 10 msg sequences per prompt message
        if (convoMsgSeqCount === 10) {
            // Dropout: always keep lines with at least one reaction; keep a
            // random (1 - DROPOUT_UNFUNNY) fraction of reaction-less ones.
            const convoKeep = convoReactCount > 0 || Math.random() >= DROPOUT_UNFUNNY;
            if (convoKeep) {
                // write JSONL format
                fs.appendFileSync('output.json', JSON.stringify(promptMsg) + '\n');
            }
            convoMsgSeqCount = convoReactCount = 0;
            promptMsg = [];
            discordMsgs = [];
            convoRefs = {};
        }

        convoRefs[msg.id] = cleanContent;

        // write a single discord message to the prompt
        discordMsgs.push(structurePrompt(msg, cleanContent));

        if (++counter % 1000 === 0) {
            console.log(counter + " messages written");
        }
    });

    stream.on('close', async () => {
        // FIX: flush the final, still-open message sequence; the original
        // discarded whatever remained in discordMsgs at end of stream.
        if (discordMsgs.length > 0) {
            promptMsg.push({
                role: botAuthoredMsgSequence ? 'assistant' : 'user',
                content: discordMsgs.join('\n')
            });
        }
        if (promptMsg.length) {
            fs.appendFileSync('output.json', JSON.stringify(promptMsg) + '\n');
        }
        console.log("Done!");
    });
}

main();
|