MikuAI/data/process.js

195 lines
7.2 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

const fs = require('node:fs');
const JSONStream = require('JSONStream');
// --- Dataset tuning knobs ---------------------------------------------------
const MIKU_FREQ = 5; // 1/5 = 20% of message chains are randomly chosen to be from Miku
const DROPOUT_UNFUNNY = 0.75; // 75% dropout rate for message chains which have NO reactions
// Display names exactly as they appear in the exported chat log; used to scrub
// "@nickname" mentions out of message content before it reaches the dataset.
const SAVED_USERNAMES = [ // usernames as they were recorded in the chat log (@nickname)
'vinny volcano\uD83C\uDF0B (伊胜焱)',
'Server Comp!',
'Make The Map \uD83D\uDDFA',
'1981 Celical Man\uD83C\uDF41\uD83C\uDF42',
'Hatsune Miku',
'Cutie Kazerounian\uD83C\uDF41\uD83C\uDF42',
'David Pan (Fembooru)\uD83C\uDF41\uD83C\uDF42',
'Exiled Sammy \uD83D\uDD12\uD83C\uDFDD⏱',
'shibe.mp4❄☃',
'Today Man-San(1990)\uD83C\uDF41\uD83C\uDF42',
'owner',
'cj7 by stephen chow (gmod PC)\uD83C\uDF41\uD83C\uDF42',
'Nicolaid',
'epbic',
'Capn Vincent 🏴☠🏝',
'1715 Galleonpilled Skipchud ⚓🦜',
'me gold doubloons🏴☠🏆',
'Boatswain Samuel ⚓⛵ 🌊'
];
// Maps a Discord username to the person's real name, used for the "name" field
// of each structured prompt message. Several aliases may map to one person
// (e.g. Myles Linden appears under four usernames). Usernames not listed here
// yield an undefined "name", which JSON.stringify omits.
const REAL_NAMES = { // username to real name mapping
'vinso1445': 'Vincent Iannelli',
'scoliono': 'James Shiffer',
'gnuwu': 'David Zheng',
'f0oby': 'Myles Linden',
'bapazheng': 'Myles Linden',
'bapabakshi': 'Myles Linden',
'keliande27': 'Myles Linden',
'1thinker': 'Samuel Habib',
'adam28405': 'Adam Kazerounian',
'shibe.mp4': 'Jake Wong'
};
/**
 * Builds a JSONL finetuning dataset (output.json) from a Discord chat export
 * (input.json), streamed with JSONStream so the full log never sits in memory.
 *
 * Messages are grouped into "message sequences": consecutive messages by the
 * same author, each less than 7 minutes after the previous one. Each sequence
 * becomes one chatML turn; 10 sequences make one training line. Roughly
 * 1/MIKU_FREQ of sequences are relabeled as assistant ("Hatsune Miku") turns.
 * Lines whose messages collected no reactions are dropped with probability
 * DROPOUT_UNFUNNY.
 */
async function main() {
    let counter = 0; // total Discord messages processed (progress logging)

    // Start from a clean slate; ignore the error if output.json doesn't exist.
    // BUGFIX: the original called `fsProm.unlink`, but `fsProm` was never
    // defined — the ReferenceError was swallowed by the empty catch, so the
    // old dataset was never deleted and reruns kept appending to it.
    try {
        await fs.promises.unlink('output.json');
    } catch {}

    const stream = fs.createReadStream('input.json').pipe(JSONStream.parse('messages.*'));

    let lastMsgAuthor;          // author id of the previously seen message
    let lastMsgTime;            // timestamp (Date) of the previously seen message
    let botAuthoredMsgSequence; // is the current sequence attributed to Miku?
    let convoMsgSeqCount = 0;   // sequences flushed into the current prompt line
    let convoReactCount = 0;    // reactions seen in the current prompt line
    let promptMsg = [];   // group of formatted msg seqs to be written to one line of the final dataset
    let discordMsgs = []; // a single formatted message sequence (JSON strings)
    let convoRefs = {};   // sliding window: msg id -> cleaned content, for reply context

    /**
     * Langchain structured output
     *
     * Beneath a few layers of abstraction, the finetuned model is ultimately prompted like so:
     *
     * ```
     * USER:
     * Answer the user query.
     * [ Langchain JSON structured output instructions ]
     * { ... "author": "vinso1445", "content": "message history 1" ... }
     * { ... "author": "f0oby", "content": "message history 2" ... }
     * { ... "author": "scoliono", "content": "message history 3" ... }
     *
     * ASSISTANT:
     * { ... "author": "Hatsune Miku", "content": "message history 1" ... }
     * ```
     *
     * To this end, this function formats Discord messages in the same format as
     * the Pydantic object seen by Langchain. (The Langchain-specific
     * instructions are not included.) Each turn by the user or assistant is a
     * "prompt message" holding one message sequence's worth of JSON lines; in
     * the actual JSONL dataset, one line holds 10 message sequences.
     *
     * Note: the training data will sometimes pack multiple Discord messages
     * into a single assistant turn. Although it may seem unorthodox for an LLM
     * to double-text you, real people split a thought across several messages;
     * it's up to the inference code to decide what to do with this.
     *
     * @param {object} msg raw message object from the chat export
     * @param {string} cleanContent message text with links/@mentions scrubbed
     * @param {boolean} isBotMessage attribute this message to Miku?
     * @returns {string} one compact JSON line representing the message
     */
    function structurePrompt(msg, cleanContent, isBotMessage = false) {
        // Replies: if the replied-to message is recent enough to still be in
        // the sliding window, embed it as context; otherwise leave it out.
        let repliedToContent;
        if (msg.type === "Reply" && msg.reference?.messageId in convoRefs) {
            repliedToContent = convoRefs[msg.reference.messageId];
        }
        // record reactions the message got in a compact string form,
        // e.g. ":kekw: (3), :heart: (1)"; stays undefined if there are none
        let reactionString;
        for (const reaction of msg.reactions) {
            const entry = `:${reaction.emoji.code}: (${reaction.count})`;
            reactionString = reactionString === undefined ? entry : `${reactionString}, ${entry}`;
        }
        // 'name', 'context', 'reactions' may be undefined, in which case
        // JSON.stringify omits those fields entirely
        return JSON.stringify({
            timestamp: (new Date(msg.timestamp)).toUTCString(),
            author: isBotMessage ? 'Hatsune Miku#1740' : msg.author.name,
            name: isBotMessage ? 'Hatsune Miku' : REAL_NAMES[msg.author.name],
            context: repliedToContent,
            content: cleanContent,
            reactions: reactionString
        });
    }

    /**
     * Flush the in-progress message sequence into the current prompt line as
     * one chatML turn. Returns true if something was actually flushed.
     */
    function flushSequence() {
        if (discordMsgs.length === 0) {
            return false;
        }
        // follow chatML chat template when writing to prompt
        promptMsg.push({
            role: botAuthoredMsgSequence ? 'assistant' : 'user',
            content: discordMsgs.join('\n')
        });
        discordMsgs = [];
        return true;
    }

    stream.on('data', (msg) => {
        // no bot/system messages
        if (msg.author.isBot || (msg.type !== "Default" && msg.type !== "Reply")) {
            return;
        }
        // scrub links
        let cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '');
        // scrub @mentions
        for (const username of SAVED_USERNAMES) {
            cleanContent = cleanContent.replaceAll(`@${username}`, "");
        }
        if (!cleanContent) {
            return;
        }
        // count reactions toward the current prompt line's "funny" score
        convoReactCount += msg.reactions.length;
        // a new sequence starts when the author changes or >= 7 minutes passed
        const msgTime = new Date(msg.timestamp);
        if (lastMsgAuthor !== msg.author.id || (msgTime - lastMsgTime) / 60000 >= 7) {
            lastMsgAuthor = msg.author.id;
            // BUGFIX: only count/flush a real sequence. The original pushed an
            // empty "" turn here on the very first message, because the
            // boundary condition trivially fires when lastMsgAuthor is unset.
            if (flushSequence()) {
                ++convoMsgSeqCount;
            }
            // bot will pretend to author a random 1/MIKU_FREQ of msg sequences
            botAuthoredMsgSequence = Math.floor(Math.random() * MIKU_FREQ) === 0;
        }
        lastMsgTime = msgTime;
        // 10 msg sequences per prompt message
        if (convoMsgSeqCount === 10) {
            // dropout for conversations nobody reacted to
            const convoKeep = convoReactCount > 0 || Math.random() >= DROPOUT_UNFUNNY;
            if (convoKeep) {
                // write JSONL format (output.json is JSON Lines, one array per line)
                fs.appendFileSync('output.json', JSON.stringify(promptMsg) + '\n');
            }
            convoMsgSeqCount = convoReactCount = 0;
            promptMsg = [];
            discordMsgs = [];
            convoRefs = {};
        }
        convoRefs[msg.id] = cleanContent;
        // write a single discord message to the prompt
        discordMsgs.push(structurePrompt(msg, cleanContent, botAuthoredMsgSequence));
        if (++counter % 1000 === 0) {
            console.log(counter + " messages written");
        }
    });

    stream.on('close', () => {
        // BUGFIX: flush the trailing, still-open message sequence first — the
        // original dropped whatever was sitting in discordMsgs at EOF.
        flushSequence();
        if (promptMsg.length) {
            fs.appendFileSync('output.json', JSON.stringify(promptMsg) + '\n');
        }
        console.log("Done!");
    });
}
main();