2024-04-06 01:38:46 -07:00
|
|
|
|
const fs = require('node:fs');
|
|
|
|
|
const JSONStream = require('JSONStream');
|
|
|
|
|
|
2024-05-25 04:35:24 +00:00
|
|
|
|
/**
 * One in `MIKU_FREQ` message sequences is randomly attributed to the bot
 * (1/5 = 20% of message chains are randomly chosen to be from Miku).
 */
const MIKU_FREQ = 5;

/**
 * Probability of discarding a group of message sequences that received NO
 * reactions at all (0.75 = 75% dropout rate).
 */
const DROPOUT_UNFUNNY = 0.75;

/**
 * Usernames as they were recorded in the chat log (@nickname). Every
 * `@<entry>` occurrence is scrubbed from message content.
 */
const SAVED_USERNAMES = [
    'vinny volcano\uD83C\uDF0B (伊胜焱)',
    'Server Comp!',
    'Make The Map \uD83D\uDDFA',
    '1981 Celical Man\uD83C\uDF41\uD83C\uDF42',
    'Hatsune Miku',
    'Cutie Kazerounian\uD83C\uDF41\uD83C\uDF42',
    'David Pan (Fembooru)\uD83C\uDF41\uD83C\uDF42',
    'Exiled Sammy \uD83D\uDD12\uD83C\uDFDD⏱',
    'shibe.mp4❄☃',
    'Today Man-San(1990)\uD83C\uDF41\uD83C\uDF42',
    'owner',
    'cj7 by stephen chow (gmod PC)\uD83C\uDF41\uD83C\uDF42',
    'Nicolaid',
    'epbic',
    'Cap’n Vincent 🏴☠🏝',
    '1715 Galleonpilled Skipchud ⚓🦜',
    'me gold doubloons🏴☠🏆',
    'Boatswain Samuel ⚓⛵ 🌊'
];

/**
 * Username to real name mapping.
 *
 * The table is built on a null prototype because it is indexed with author
 * names taken straight from the chat export: on a plain object literal a name
 * such as `__proto__` or `constructor` would resolve to an inherited
 * `Object.prototype` member instead of `undefined`. Bracket lookup
 * (`REAL_NAMES[name]`) works exactly as before for the listed usernames.
 */
const REAL_NAMES = Object.assign(Object.create(null), {
    'vinso1445': 'Vincent Iannelli',
    'scoliono': 'James Shiffer',
    'gnuwu': 'David Zheng',
    'f0oby': 'Myles Linden',
    'bapazheng': 'Myles Linden',
    'bapabakshi': 'Myles Linden',
    'keliande27': 'Myles Linden',
    '1thinker': 'Samuel Habib',
    'adam28405': 'Adam Kazerounian',
    'shibe.mp4': 'Jake Wong'
});
|
2024-04-06 01:38:46 -07:00
|
|
|
|
|
|
|
|
|
/**
 * Converts a Discord chat export (`input.json`) into a JSONL dataset
 * (`output.json`).
 *
 * Messages are streamed from the `messages` array of the export, cleaned of
 * links and @mentions, grouped into "message sequences" (consecutive messages
 * by one author without a long pause) and written out in groups of
 * `SEQUENCES_PER_LINE` sequences per JSONL line. A random subset of sequences
 * is attributed to the bot, and groups that received no reactions are randomly
 * dropped according to `DROPOUT_UNFUNNY`.
 *
 * @returns {Promise<void>} resolves once the parsed stream has closed and the
 *   remaining data has been written; rejects if `output.json` cannot be removed
 *   (for a reason other than it not existing), if either stream emits `error`,
 *   or if the final write fails.
 */
async function main() {
    const SEQUENCES_PER_LINE = 10; // message sequences written to one JSONL line
    const SEQUENCE_GAP_MINUTES = 7; // pause after which the same author starts a new sequence

    let counter = 0;

    // Start from an empty output file, since every write below appends.
    // A missing file is the normal first-run case; anything else is a real error.
    try {
        await fs.promises.unlink('output.json');
    } catch (err) {
        if (err.code !== 'ENOENT') {
            throw err;
        }
    }

    const input = fs.createReadStream('input.json');
    const stream = input.pipe(JSONStream.parse('messages.*'));
    let lastMsgAuthor;
    let lastMsgTime;
    let botAuthoredMsgSequence;
    let convoMsgSeqCount = 0;
    let convoReactCount = 0;
    let promptMsg = []; // group of formatted msg seqs to be written to one line of the final dataset
    let discordMsgs = []; // a single formatted message sequence
    const convoRefs = new Map(); // message id -> cleaned content, for the current group only

    /**
     * Langchain structured output
     *
     * Beneath a few layers of abstraction, the finetuned model is ultimately prompted like so:
     *
     * ```
     * USER:
     * Answer the user query.
     * [ Langchain JSON structured output instructions ]
     * { ... "author": "vinso1445", "content": "message history 1" ... }
     * { ... "author": "f0oby", "content": "message history 2" ... }
     * { ... "author": "scoliono", "content": "message history 3" ... }
     *
     *
     * ASSISTANT:
     * { ... "author": "Hatsune Miku#1740", "content": "message history 1" ... }
     * ```
     *
     * To this end, this function formats a Discord message in the same format as the
     * Pydantic object seen by Langchain. (The Langchain-specific instructions are not included.)
     *
     * Each turn by the user or assistant in the LLM-level conversation is henceforth known as a
     * "prompt message". The individual JSON lines in this example represent Discord messages,
     * with one prompt message containing one "message sequence"'s worth. In the actual JSONL
     * dataset, though, one line represents 10 message sequences.
     *
     * Note: the training data will sometimes have multiple Discord messages in a single assistant
     * message sequence. Although it may seem unorthodox to have an LLM double-text you, this is
     * supposed to emulate a real Discord conversation, and real people have a tendency to split up
     * a thought across multiple messages. It's up to the inference code to decide what to do with
     * this.
     *
     * @param {object} msg - message object from the export; `timestamp`, `author.name`, `type`,
     *   `reference` and `reactions` are read.
     * @param {string} cleanContent - message text with links and @mentions already removed.
     * @param {boolean} [isBotMessage=false] - attribute the message to the bot instead of its
     *   real author.
     * @returns {string} one JSON-encoded line. `name`, `context` and `reactions` are omitted
     *   when they are undefined.
     */
    function structurePrompt(msg, cleanContent, isBotMessage = false) {
        // Replies: if the replied-to message belongs to the current group of sequences, embed
        // its cleaned text as context. Older (or unknown) references are left out.
        let repliedToContent;
        if (msg.type === 'Reply') {
            // `reference` is guarded so a reply without reference data does not throw
            repliedToContent = convoRefs.get(msg.reference?.messageId);
        }

        // record reactions the message got in a compact string form
        const reactions = msg.reactions ?? [];
        const reactionString = reactions.length > 0
            ? reactions.map((reaction) => `:${reaction.emoji.code}: (${reaction.count})`).join(', ')
            : undefined;

        // JSON.stringify drops properties whose value is undefined
        return JSON.stringify({
            timestamp: (new Date(msg.timestamp)).toUTCString(),
            author: isBotMessage ? 'Hatsune Miku#1740' : msg.author.name,
            name: isBotMessage ? 'Hatsune Miku' : REAL_NAMES[msg.author.name],
            context: repliedToContent,
            content: cleanContent,
            reactions: reactionString
        });
    }

    /**
     * Moves the message sequence collected so far into the current group as one prompt
     * message. Does nothing when no message has been collected, so the very first message
     * of the export does not produce an empty prompt message.
     */
    function flushMsgSequence() {
        if (discordMsgs.length === 0) {
            return;
        }
        // follow chatML chat template when writing to prompt
        promptMsg.push({
            role: botAuthoredMsgSequence ? 'assistant' : 'user',
            content: discordMsgs.join('\n')
        });
        discordMsgs = [];
        ++convoMsgSeqCount;
    }

    /** Appends the current group of prompt messages as one JSONL line. */
    function writePromptLine() {
        fs.appendFileSync('output.json', `${JSON.stringify(promptMsg)}\n`);
    }

    /** Processes one message object emitted by the JSON stream. */
    function handleMessage(msg) {
        // no bot/system messages
        if (msg.author.isBot || (msg.type !== 'Default' && msg.type !== 'Reply')) {
            return;
        }

        // scrub links
        let cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '');
        // scrub @mentions
        for (const username of SAVED_USERNAMES) {
            cleanContent = cleanContent.replaceAll(`@${username}`, '');
        }
        // nothing but whitespace left after scrubbing: not worth a dataset entry
        if (!cleanContent.trim()) {
            return;
        }

        // determine continuity of message sequences
        const msgTime = new Date(msg.timestamp);
        const startsNewSequence = lastMsgAuthor !== msg.author.id
            || (msgTime - lastMsgTime) / 60000 >= SEQUENCE_GAP_MINUTES;
        if (startsNewSequence) {
            lastMsgAuthor = msg.author.id;
            flushMsgSequence();
            // bot will pretend to author a random number of msg sequences
            botAuthoredMsgSequence = Math.floor(Math.random() * MIKU_FREQ) === 0;
        }
        lastMsgTime = msgTime;

        // a full group of sequences has been collected: write it (subject to dropout)
        if (convoMsgSeqCount === SEQUENCES_PER_LINE) {
            // dropout
            const convoKeep = convoReactCount > 0 || Math.random() >= DROPOUT_UNFUNNY;
            if (convoKeep) {
                // write JSONL format
                writePromptLine();
            }
            convoMsgSeqCount = 0;
            convoReactCount = 0;
            promptMsg = [];
            discordMsgs = [];
            convoRefs.clear();
        }

        // Count reactions only after the group boundary check, so that this message's
        // reactions are credited to the group the message actually belongs to.
        convoReactCount += (msg.reactions ?? []).length;
        convoRefs.set(msg.id, cleanContent);

        // write a single discord message to the prompt
        discordMsgs.push(structurePrompt(msg, cleanContent, botAuthoredMsgSequence));

        if (++counter % 1000 === 0) {
            console.log(`${counter} messages written`);
        }
    }

    /** Writes whatever is still buffered once the stream has closed. */
    function finish() {
        // the last sequence has no following message to trigger its flush
        flushMsgSequence();
        if (promptMsg.length) {
            writePromptLine();
        }
        console.log("Done!");
    }

    await new Promise((resolve, reject) => {
        // pipe() does not forward errors, so both ends need a handler
        input.once('error', reject);
        stream.once('error', reject);
        stream.on('data', handleMessage);
        stream.once('close', () => {
            try {
                finish();
                resolve();
            } catch (err) {
                reject(err);
            }
        });
    });
}
|
|
|
|
|
|
|
|
|
|
// Entry point: report a failed run explicitly instead of leaving the promise floating.
main().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});
|