New training data prep scripts

This commit is contained in:
James S 2025-01-16 16:32:57 -08:00
parent f755a03eb2
commit 8159b11f4f
2 changed files with 114 additions and 27 deletions

View File

@ -1,9 +1,17 @@
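/**
* procToxicQA.js
* This script assumes you have downloaded ToxicQA
* (https://huggingface.co/datasets/NobodyExistsOnTheInternet/toxicqa/blob/main/finalToxicQA.jsonl)
* and saved it as 'toxicQA.json' in the current working directory.
* The file is read as JSONL: one JSON object per line.
*/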
const fs = require('node:fs');
var lineReader = require('readline').createInterface({
input: fs.createReadStream('toxicQA.json')
});
var outstream = fs.createWriteStream('toxicQAfinal.json');
fs.unlinkSync('toxicQAfinal.json');
if (fs.existsSync('toxicQAfinal.json')) {
fs.unlinkSync('toxicQAfinal.json');
}
lineReader.on('line', function (line) {
const dialogue = JSON.parse(line)["conversations"];

View File

@ -3,7 +3,7 @@ const JSONStream = require('JSONStream');
// 1 in MIKU_FREQ message sequences (1/5 = 20%) is randomly chosen to be written with the
// assistant role, i.e. as if Miku had authored it
const MIKU_FREQ = 5;
// probability (75%) of discarding a completed group of 10 message sequences in which NO
// message received a reaction; groups with at least one reaction are always kept
const DROPOUT_UNFUNNY = 0.75;
const USERNAMES = [
const SAVED_USERNAMES = [ // usernames as they were recorded in the chat log (@nickname)
'vinny volcano\uD83C\uDF0B (伊胜焱)',
'Server Comp!',
'Make The Map \uD83D\uDDFA',
@ -18,7 +18,23 @@ const USERNAMES = [
'cj7 by stephen chow (gmod PC)\uD83C\uDF41\uD83C\uDF42',
'Nicolaid',
'epbic',
'Capn Vincent 🏴☠🏝',
'1715 Galleonpilled Skipchud ⚓🦜',
'me gold doubloons🏴☠🏆',
'Boatswain Samuel ⚓⛵ 🌊'
];
/**
 * Lookup table from a Discord account username to the real name of the person behind it.
 * Several usernames may resolve to the same person. Usernames absent from this table have
 * no known real name.
 */
const REAL_NAMES = {
    vinso1445: 'Vincent Iannelli',
    scoliono: 'James Shiffer',
    gnuwu: 'David Zheng',
    f0oby: 'Myles Linden',
    bapazheng: 'Myles Linden',
    bapabakshi: 'Myles Linden',
    keliande27: 'Myles Linden',
    '1thinker': 'Samuel Habib',
    adam28405: 'Adam Kazerounian',
    'shibe.mp4': 'Jake Wong'
};
async function main() {
let counter = 0;
@ -32,9 +48,77 @@ async function main() {
let botAuthoredMsgSequence;
let convoMsgSeqCount = 0;
let convoReactCount = 0;
let convoMsgs = [];
let promptMsg = []; // group of formatted msg seqs to be written to one line of the final dataset
let discordMsgs = []; // a single formatted message sequence
let convoRefs = {};
/**
* Langchain structured output
*
* Beneath a few layers of abstraction, the finetuned model is ultimately prompted like so:
*
* ```
* USER:
* Answer the user query.
* [ Langchain JSON structured output instructions ]
* { ... "author": "vinso", "content": "message history 1" ... }
* { ... "author": "f0oby", "content": "message history 2" ... }
* { ... "author": "scoliono", "content": "message history 3" ... }
*
*
* ASSISTANT:
* { ... "author": "Hatsune Miku", "content": "message history 1" ... }
* ```
*
* To this end, we have a function to format Discord messages in the same format as the
* Pydantic object seen by Langchain. (The Langchain-specific instructions are not included.)
*
* Each turn by the user or assistant in the LLM-level conversation is henceforth known as a
* "prompt message". The individual JSON lines in this example are supposed to represent
* Discord messages, with one prompt message containing a "message sequence"'s worth. In the
* actual JSONL dataset, though, one line represents 10 message sequences.
*
* Note: the training data will sometimes have multiple Discord messages in a single assistant
* message sequence. Although it may seem unorthodox to have an LLM double-text you, this is
* supposed to emulate a real Discord conversation, and real people have a tendency to split up
* a thought across multiple messages. It's up to the inference code to decide what to do with
* this.
*/
/**
 * Serialize a single Discord message as one JSON string for the training prompt.
 *
 * Reads the enclosing `convoRefs` table (message id -> cleaned content of messages seen so
 * far in the current conversation) and the `REAL_NAMES` username table.
 *
 * @param {object} msg - Discord message record. Fields read: `type`, `reference.messageId`,
 *   `reactions[].emoji.code`, `reactions[].count`, `timestamp`, `author.name`.
 * @param {string} cleanContent - The message text to emit as `content`.
 * @returns {string} JSON object with keys `timestamp`, `author`, `name`, `context`,
 *   `content`, `reactions` (in that order). `name`, `context` and `reactions` are omitted
 *   when there is no real name, no recent replied-to message, or no reactions respectively.
 */
function structurePrompt(msg, cleanContent) {
    // Own-property test only: a plain `in` / bracket lookup would also match inherited keys
    // such as "constructor" or "__proto__" coming from untrusted usernames or ids.
    const hasOwn = (table, key) => Object.prototype.hasOwnProperty.call(table, key);

    /**
     * Handle replies by maintaining a sliding window of message references.
     * If the replied-to message is too old to be part of this conversation, then leave this
     * message alone. If it's recent, then embed it as context for this message.
     */
    let repliedToContent;
    // a Reply whose reference is missing is treated like a reply to a stale message
    const repliedToId = msg.reference?.messageId;
    if (msg.type === "Reply" && repliedToId != null && hasOwn(convoRefs, repliedToId)) {
        repliedToContent = convoRefs[repliedToId];
    }

    // record reactions the message got in a compact string form, e.g. ":joy: (2), :skull: (1)";
    // stays undefined when the message has no reactions so the field is dropped from the JSON
    const reactions = msg.reactions ?? [];
    const reactionString = reactions.length > 0
        ? reactions.map((reaction) => `:${reaction.emoji.code}: (${reaction.count})`).join(', ')
        : undefined;

    const authorName = msg.author.name;
    const realName = hasOwn(REAL_NAMES, authorName) ? REAL_NAMES[authorName] : undefined;

    // JSON.stringify skips properties whose value is undefined, so 'name', 'context' and
    // 'reactions' are simply absent when they do not apply
    return JSON.stringify({
        timestamp: (new Date(msg.timestamp)).toUTCString(),
        author: authorName,
        name: realName,
        context: repliedToContent,
        content: cleanContent,
        reactions: reactionString
    });
}
stream.on('data', async (msg) => {
// no bot/system messages
if (msg.author.isBot || (msg.type !== "Default" && msg.type !== "Reply")) {
@ -44,25 +128,13 @@ async function main() {
// scrub links
let cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '');
// scrub @mentions
for (const username of USERNAMES) {
for (const username of SAVED_USERNAMES) {
cleanContent = cleanContent.replaceAll(`@${username}`, "");
}
if (!cleanContent) {
return;
}
/**
* Handle replies by maintaining a sliding window of message references.
* If the replied-to message is too old to be part of this conversation, then leave this
* message alone.
* If it's recent, then embed it as context for this message, using the old-fashioned
* reply syntax: "> original message \n reply message"
*/
if (msg.type === "Reply" && msg.reference.messageId in convoRefs) {
const repliedToContentLines = convoRefs[msg.reference.messageId].split('\n');
cleanContent = `> ${repliedToContentLines.join('\n> ')}\n${cleanContent}`;
}
// count reaction
convoReactCount += msg.reactions.length;
@ -71,6 +143,14 @@ async function main() {
if (lastMsgAuthor !== msg.author.id || (msgTime - lastMsgTime)/60000 >= 7) {
lastMsgAuthor = msg.author.id;
// follow chatML chat template when writing to prompt
promptMsg.push({
role: botAuthoredMsgSequence ? 'assistant' : 'user',
content: discordMsgs.join('\n')
});
discordMsgs = [];
// bot will pretend to author a random number of msg sequences
botAuthoredMsgSequence = Math.floor(Math.random() * MIKU_FREQ) === 0;
@ -78,35 +158,34 @@ async function main() {
}
lastMsgTime = msgTime;
// 10 msg sequences per "conversation"
// 10 msg sequences per prompt message
if (convoMsgSeqCount === 10) {
// dropout
const convoKeep = convoReactCount > 0 || Math.random() >= DROPOUT_UNFUNNY;
if (convoKeep) {
// write JSONL format
fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
fs.appendFileSync('output.json', JSON.stringify(promptMsg) + '\n');
}
convoMsgSeqCount = convoReactCount = 0;
convoMsgs = [];
promptMsg = [];
discordMsgs = [];
convoRefs = {};
}
// follow chatML chat template
const outMsg = {
role: botAuthoredMsgSequence ? "assistant" : "user",
content: cleanContent
};
convoMsgs.push(outMsg);
convoRefs[msg.id] = cleanContent;
// write a single discord message to the prompt
discordMsgs.push(structurePrompt(msg, cleanContent));
if (++counter % 1000 === 0) {
console.log(counter + " messages written");
}
});
stream.on('close', async () => {
if (convoMsgs.length) {
fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
if (promptMsg.length) {
fs.appendFileSync('output.json', JSON.stringify(promptMsg) + '\n');
}
console.log("Done!");
});