MikuAI/data/process.js

const fs = require('node:fs');
const fsProm = require('node:fs/promises'); // promises API, used for the unlink below
const JSONStream = require('JSONStream');

const MIKU_FREQ = 5; // 1/5 = 20% of message chains are randomly chosen to be from Miku
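// Note: Math.floor(Math.random() * MIKU_FREQ) === 0 holds with probability
// exactly 1/MIKU_FREQ, which is how the 20% sampling below is implemented.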

async function main() {
  let counter = 0;
  // start from a clean slate; swallow the error if output.json doesn't exist yet
  try {
    await fsProm.unlink('output.json');
  } catch {}
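  // Stream-parse the export so the whole file never has to sit in memory:
  // 'messages.*' makes JSONStream emit each element of the top-level
  // "messages" array one at a time. (Assumption: input.json is a channel
  // export in the style of DiscordChatExporter, i.e. an object of the form
  // { "messages": [{ author, type, content, timestamp }, ...] }.)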
  const stream = fs.createReadStream('input.json').pipe(JSONStream.parse('messages.*'));
  let lastMsgAuthor;
  let lastMsgTime;
  let botAuthoredMsgSequence;
  let convoMsgSeqCount = 0;
  let convoMsgs = [];
  stream.on('data', (msg) => {
    // no bot/system messages
    if (msg.author.isBot || (msg.type !== "Default" && msg.type !== "Reply")) {
      return;
    }
    /**
     * Replies are a tricky case. I considered pasting their context in, but this
     * requires keeping a full message cache and won't scale. Another possibility is
     * to maintain a small sliding window of message history and delete replies that
     * reference a message too far in the past... but what if that reply gets replied
     * to right after? Our chat in particular has a lot of these "necro" messages, but
     * since they tend to spark further discussion if anything, they probably don't
     * obfuscate the flow of conversation any more than normal time skips do, which
     * our model is incapable of picking up on in the first place.
     * TL;DR: Keep the replies. Making too many assumptions is bad.
     */
    // scrub links; trim so a message that was nothing but a link is dropped below
    const cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '').trim();
    if (!cleanContent) {
      return;
    }
    // determine continuity of message sequences: a new sequence starts when the
    // author changes or at least 7 minutes have passed since the previous message
    const msgTime = new Date(msg.timestamp);
    if (lastMsgAuthor !== msg.author.id || (msgTime - lastMsgTime) / 60000 >= 7) {
      lastMsgAuthor = msg.author.id;
      // the bot will pretend to have authored a random ~20% of message sequences
      botAuthoredMsgSequence = Math.floor(Math.random() * MIKU_FREQ) === 0;
      ++convoMsgSeqCount;
    }
    lastMsgTime = msgTime;
    // flush every 10 msg sequences as one "conversation"
    if (convoMsgSeqCount === 10) {
      // write JSONL: one conversation per line, e.g. (illustrative)
      // [{"role":"user","content":"hi"},{"role":"assistant","content":"hey!"}]
      fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
      convoMsgSeqCount = 0;
      convoMsgs = [];
    }
    // follow the ChatML chat template roles
    const outMsg = {
      role: botAuthoredMsgSequence ? "assistant" : "user",
      content: cleanContent
    };
    convoMsgs.push(outMsg);
    if (++counter % 1000 === 0) {
      console.log(counter + " messages processed");
    }
  });
  stream.on('close', () => {
    // flush whatever is left of the final partial conversation
    if (convoMsgs.length) {
      fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
    }
    console.log("Done!");
  });
}

main();
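
/*
 * Usage sketch (not part of the original script; the invocation and the
 * consumer snippet below are illustrative assumptions):
 *
 *   node process.js
 *
 * reads ./input.json and appends one conversation per line to ./output.json.
 * A downstream consumer could then load the JSONL with something like:
 *
 *   const convos = fs.readFileSync('output.json', 'utf8')
 *     .trim().split('\n').map(line => JSON.parse(line));
 */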