// MikuAI/data/process.js
// 116 lines · 3.8 KiB · JavaScript

const fs = require('node:fs');
const JSONStream = require('JSONStream');
// Denominator of the chance that a message sequence is attributed to the bot:
// main() marks a sequence as bot-authored when a random integer in [0, MIKU_FREQ) is 0.
const MIKU_FREQ = 5; // 1/5 = 20% of message chains are randomly chosen to be from Miku
// Probability of discarding a conversation whose counted reactions total zero;
// conversations with at least one reaction are always kept.
const DROPOUT_UNFUNNY = 0.75; // 75% dropout rate for message chains which have NO reactions
// Names whose "@<name>" mentions are stripped from message content by main().
// Entries are matched as exact literal substrings, so each string (including its
// emoji, written here as UTF-16 escape pairs or literal characters) must match the
// text as it appears in the input messages.
const USERNAMES = [
'vinny volcano\uD83C\uDF0B (伊胜焱)',
'Server Comp!',
'Make The Map \uD83D\uDDFA',
'1981 Celical Man\uD83C\uDF41\uD83C\uDF42',
'Hatsune Miku',
'Cutie Kazerounian\uD83C\uDF41\uD83C\uDF42',
'David Pan (Fembooru)\uD83C\uDF41\uD83C\uDF42',
'Exiled Sammy \uD83D\uDD12\uD83C\uDFDD⏱',
'shibe.mp4❄☃',
'Today Man-San(1990)\uD83C\uDF41\uD83C\uDF42',
'owner',
'cj7 by stephen chow (gmod PC)\uD83C\uDF41\uD83C\uDF42',
'Nicolaid',
'epbic',
];
/**
 * Converts the chat export in `input.json` into JSONL training data in `output.json`.
 *
 * Every element under the top-level `messages` key is streamed and processed in order:
 *  - bot messages and messages whose type is neither "Default" nor "Reply" are dropped;
 *  - links and "@<name>" mentions of the names in USERNAMES are removed, and messages
 *    left empty (or whitespace-only) are dropped;
 *  - a reply to a message still held in the current conversation gets the original
 *    quoted in front of it ("> original\nreply");
 *  - consecutive messages by the same author less than 7 minutes apart form one
 *    "message sequence"; each sequence is randomly assigned the "assistant" role
 *    with probability 1/MIKU_FREQ, otherwise "user";
 *  - when the sequence counter reaches 10 the buffered conversation is written as one
 *    JSON array of `{ role, content }` objects per line, unless it collected no
 *    reactions and loses the DROPOUT_UNFUNNY coin flip.
 *
 * Any existing `output.json` is removed first so repeated runs do not append to
 * stale data.
 *
 * @returns {Promise<void>} Resolves once the old output is removed and the stream
 *   handlers are attached; processing continues until the stream closes.
 * @throws {Error} Rejects if `output.json` exists but cannot be removed.
 */
async function main() {
  const outputPath = 'output.json';
  const linkPattern = /https?:\/\/\S+/gi;
  const seqsPerConvo = 10;
  const maxGapMinutes = 7;

  // `force` makes a missing file a no-op while still surfacing real failures
  // (e.g. permissions), instead of swallowing every error.
  await fs.promises.rm(outputPath, { force: true });

  const stream = fs.createReadStream('input.json').pipe(JSONStream.parse('messages.*'));

  let counter = 0;
  let lastMsgAuthor;
  let lastMsgTime;
  let botAuthoredMsgSequence;
  let convoMsgSeqCount = 0;
  let convoReactCount = 0;
  let convoMsgs = [];
  // message id -> cleaned content, for the messages of the current conversation
  let convoRefs = new Map();

  // Appends the buffered conversation as a single JSONL line.
  const writeConvo = () => {
    fs.appendFileSync(outputPath, `${JSON.stringify(convoMsgs)}\n`);
  };

  stream.on('data', (msg) => {
    // no bot/system messages
    if (msg.author.isBot || (msg.type !== 'Default' && msg.type !== 'Reply')) {
      return;
    }

    // scrub links
    let cleanContent = msg.content.replaceAll(linkPattern, '');
    // scrub @mentions
    for (const username of USERNAMES) {
      cleanContent = cleanContent.replaceAll(`@${username}`, '');
    }
    // Scrubbing can leave only the spaces that separated links/mentions; such a
    // message carries no text, so treat it the same as an empty one.
    if (!cleanContent.trim()) {
      return;
    }

    /**
     * Handle replies by maintaining a window of message references.
     * If the replied-to message is not part of the tracked window, leave this
     * message alone. Otherwise embed it as context using the old-fashioned reply
     * syntax: "> original message \n reply message"
     */
    if (msg.type === 'Reply' && convoRefs.has(msg.reference.messageId)) {
      const quoted = convoRefs.get(msg.reference.messageId).split('\n').join('\n> ');
      cleanContent = `> ${quoted}\n${cleanContent}`;
    }

    // Determine continuity of message sequences. On the very first message
    // lastMsgTime is undefined, so the gap is NaN and only the author check fires.
    const msgTime = new Date(msg.timestamp);
    const gapMinutes = (msgTime - lastMsgTime) / 60000;
    if (lastMsgAuthor !== msg.author.id || gapMinutes >= maxGapMinutes) {
      lastMsgAuthor = msg.author.id;
      // bot will pretend to author a random number of msg sequences
      botAuthoredMsgSequence = Math.floor(Math.random() * MIKU_FREQ) === 0;
      ++convoMsgSeqCount;
    }
    lastMsgTime = msgTime;

    // Once the sequence counter hits the limit, flush what was buffered so far;
    // the current message then opens the next conversation.
    if (convoMsgSeqCount === seqsPerConvo) {
      // dropout: conversations without reactions are kept only some of the time
      const convoKeep = convoReactCount > 0 || Math.random() >= DROPOUT_UNFUNNY;
      if (convoKeep) {
        writeConvo();
      }
      convoMsgSeqCount = 0;
      convoReactCount = 0;
      convoMsgs = [];
      convoRefs = new Map();
    }

    // Count reactions only after a possible flush, so they are credited to the
    // conversation this message actually belongs to.
    convoReactCount += msg.reactions.length;

    // follow chatML chat template
    convoMsgs.push({
      role: botAuthoredMsgSequence ? 'assistant' : 'user',
      content: cleanContent,
    });
    convoRefs.set(msg.id, cleanContent);

    if (++counter % 1000 === 0) {
      console.log(`${counter} messages written`);
    }
  });

  stream.on('close', () => {
    // The trailing partial conversation is always written (no dropout applied).
    if (convoMsgs.length) {
      writeConvo();
    }
    console.log('Done!');
  });
}
// Entry point: run the conversion. A rejection from main() is reported and turned
// into a failing exit code instead of being left as a floating promise.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});