diff --git a/data/procToxicQA.js b/data/procToxicQA.js
index 7ecc75e..dec788d 100644
--- a/data/procToxicQA.js
+++ b/data/procToxicQA.js
@@ -1,9 +1,17 @@
+/**
+ * procToxicQA.js
+ * This script assumes you have ToxicQA (https://huggingface.co/datasets/NobodyExistsOnTheInternet/toxicqa/blob/main/finalToxicQA.jsonl)
+ * downloaded to 'toxicQA.json'.
+ */
+
 const fs = require('node:fs');
 var lineReader = require('readline').createInterface({
     input: fs.createReadStream('toxicQA.json')
 });
 var outstream = fs.createWriteStream('toxicQAfinal.json');
-fs.unlinkSync('toxicQAfinal.json');
+if (fs.existsSync('toxicQAfinal.json')) {
+    fs.unlinkSync('toxicQAfinal.json');
+}
 
 lineReader.on('line', function (line) {
     const dialogue = JSON.parse(line)["conversations"];
diff --git a/data/process.js b/data/process.js
index 8244360..9ec52cc 100644
--- a/data/process.js
+++ b/data/process.js
@@ -3,7 +3,7 @@ const JSONStream = require('JSONStream');
 
 const MIKU_FREQ = 5; // 1/5 = 20% of message chains are randomly chosen to be from Miku
 const DROPOUT_UNFUNNY = 0.75; // 75% dropout rate for message chains which have NO reactions
-const USERNAMES = [
+const SAVED_USERNAMES = [ // usernames as they were recorded in the chat log (@nickname)
     'vinny volcano\uD83C\uDF0B (伊胜焱)',
     'Server Comp!',
     'Make The Map \uD83D\uDDFA',
@@ -18,7 +18,23 @@
     'cj7 by stephen chow (gmod PC)\uD83C\uDF41\uD83C\uDF42',
     'Nicolaid',
     'epbic',
+    'Cap’n Vincent 🏴☠🏝',
+    '1715 Galleonpilled Skipchud ⚓🦜',
+    'me gold doubloons🏴☠🏆',
+    'Boatswain Samuel ⚓⛵ 🌊'
 ];
+const REAL_NAMES = { // username to real name mapping
+    'vinso1445': 'Vincent Iannelli',
+    'scoliono': 'James Shiffer',
+    'gnuwu': 'David Zheng',
+    'f0oby': 'Myles Linden',
+    'bapazheng': 'Myles Linden',
+    'bapabakshi': 'Myles Linden',
+    'keliande27': 'Myles Linden',
+    '1thinker': 'Samuel Habib',
+    'adam28405': 'Adam Kazerounian',
+    'shibe.mp4': 'Jake Wong'
+};
 
 async function main() {
     let counter = 0;
@@ -32,9 +48,77 @@
     let botAuthoredMsgSequence;
     let convoMsgSeqCount = 0;
     let convoReactCount = 0;
-    let convoMsgs = [];
+    let promptMsg = []; // group of formatted msg seqs to be written to one line of the final dataset
+    let discordMsgs = []; // a single formatted message sequence
     let convoRefs = {};
 
+    /**
+     * Langchain structured output
+     *
+     * Beneath a few layers of abstraction, the finetuned model is ultimately prompted like so:
+     *
+     * ```
+     * USER:
+     * Answer the user query.
+     * [ Langchain JSON structured output instructions ]
+     * { ... "author": "vinso", "content": "message history 1" ... }
+     * { ... "author": "f0oby", "content": "message history 2" ... }
+     * { ... "author": "scoliono", "content": "message history 3" ... }
+     *
+     *
+     * ASSISTANT:
+     * { ... "author": "Hatsune Miku", "content": "message history 1" ... }
+     * ```
+     *
+     * To this end, we have a function to format Discord messages in the same format as the
+     * Pydantic object seen by Langchain. (The Langchain-specific instructions are not included.)
+     *
+     * Each turn by the user or assistant in the LLM-level conversation is henceforth known as a
+     * "prompt message". The individual JSON lines in this example are supposed to represent
+     * Discord messages, with one prompt message containing a "message sequence"'s worth. In the
+     * actual JSONL dataset, though, one line represents 10 message sequences.
+     *
+     * Note: the training data will sometimes have multiple Discord messages in a single
+     * assistant message sequence. Although it may seem unorthodox to have an LLM double-text
+     * you, this is supposed to emulate a real Discord conversation, and real people have a
+     * tendency to split up a thought across multiple messages. It's up to the inference code
+     * to decide what to do with this.
+     */
+    function structurePrompt(msg, cleanContent) {
+        /**
+         * Handle replies by maintaining a sliding window of message references.
+         * If the replied-to message is too old to be part of this conversation, then leave
+         * this message alone. If it's recent, then embed it as context for this message.
+         */
+        let repliedToContent;
+        if (msg.type === "Reply" && msg.reference.messageId in convoRefs) {
+            repliedToContent = convoRefs[msg.reference.messageId];
+        }
+
+        // record reactions the message got in a compact string form
+        let reactionString;
+        for (const reaction of msg.reactions) {
+            if (reactionString === undefined) {
+                reactionString = '';
+            }
+            if (reactionString && reactionString.length > 0) {
+                reactionString += ', ';
+            }
+            reactionString += `:${reaction.emoji.code}: (${reaction.count})`;
+        }
+
+        // 'name', 'context', 'reactions' could be undefined, in which case those fields are omitted
+        return JSON.stringify({
+            timestamp: (new Date(msg.timestamp)).toUTCString(),
+            author: msg.author.name,
+            name: REAL_NAMES[msg.author.name],
+            context: repliedToContent,
+            content: cleanContent,
+            reactions: reactionString
+        });
+    }
+
     stream.on('data', async (msg) => {
         // no bot/system messages
         if (msg.author.isBot || (msg.type !== "Default" && msg.type !== "Reply")) {
@@ -44,25 +128,13 @@
         // scrub links
         let cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '');
         // scrub @mentions
-        for (const username of USERNAMES) {
+        for (const username of SAVED_USERNAMES) {
            cleanContent = cleanContent.replaceAll(`@${username}`, "");
         }
         if (!cleanContent) {
             return;
         }
 
-        /**
-         * Handle replies by maintaining a sliding window of message references.
-         * If the replied-to message is too old to be part of this conversation, then leave this
-         * message alone.
-         * If it's recent, then embed it as context for this message, using the old-fashioned
-         * reply syntax: "> original message \n reply message"
-         */
-        if (msg.type === "Reply" && msg.reference.messageId in convoRefs) {
-            const repliedToContentLines = convoRefs[msg.reference.messageId].split('\n');
-            cleanContent = `> ${repliedToContentLines.join('\n> ')}\n${cleanContent}`;
-        }
-
         // count reaction
         convoReactCount += msg.reactions.length;
 
@@ -71,6 +143,14 @@
         if (lastMsgAuthor !== msg.author.id || (msgTime - lastMsgTime)/60000 >= 7) {
             lastMsgAuthor = msg.author.id;
 
+            // follow chatML chat template when writing to prompt
+            promptMsg.push({
+                role: botAuthoredMsgSequence ? 'assistant' : 'user',
+                content: discordMsgs.join('\n')
+            });
+
+            discordMsgs = [];
+
             // bot will pretend to author a random number of msg sequences
             botAuthoredMsgSequence = Math.floor(Math.random() * MIKU_FREQ) === 0;
 
@@ -78,35 +158,34 @@
         }
         lastMsgTime = msgTime;
 
-        // 10 msg sequences per "conversation"
+        // 10 msg sequences per prompt message
         if (convoMsgSeqCount === 10) {
             // dropout
             const convoKeep = convoReactCount > 0 || Math.random() >= DROPOUT_UNFUNNY;
             if (convoKeep) {
                 // write JSONL format
-                fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
+                fs.appendFileSync('output.json', JSON.stringify(promptMsg) + '\n');
             }
             convoMsgSeqCount = convoReactCount = 0;
-            convoMsgs = [];
+            promptMsg = [];
+            discordMsgs = [];
             convoRefs = {};
         }
 
-        // follow chatML chat template
-        const outMsg = {
-            role: botAuthoredMsgSequence ? "assistant" : "user",
-            content: cleanContent
-        };
-        convoMsgs.push(outMsg);
         convoRefs[msg.id] = cleanContent;
 
+        // write a single discord message to the prompt
+        discordMsgs.push(structurePrompt(msg, cleanContent));
+
         if (++counter % 1000 === 0) {
             console.log(counter + " messages written");
         }
     });
+
     stream.on('close', async () => {
-        if (convoMsgs.length) {
-            fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
+        if (promptMsg.length) {
+            fs.appendFileSync('output.json', JSON.stringify(promptMsg) + '\n');
         }
         console.log("Done!");
     });
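
For reference, a minimal sketch of what `structurePrompt` emits for a single message. The object shape mirrors only the fields the function reads (`type`, `timestamp`, `author.name`, `reactions[].emoji.code`, `reactions[].count`); the sample message and values are hypothetical, apart from `vinso1445`, which appears in `REAL_NAMES` above:

```js
// Hypothetical message in the shape consumed by structurePrompt
// (only the fields the function actually reads are included).
const sampleMsg = {
    type: "Default", // not a "Reply", so the 'context' field is omitted
    timestamp: "2024-03-01T12:34:56.000Z",
    author: { name: "vinso1445" },
    reactions: [{ emoji: { code: "kekw" }, count: 3 }]
};

// structurePrompt(sampleMsg, "hello world") then returns the JSON line:
// {"timestamp":"Fri, 01 Mar 2024 12:34:56 GMT","author":"vinso1445","name":"Vincent Iannelli","content":"hello world","reactions":":kekw: (3)"}
```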
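Putting the two together, each line of `output.json` is `JSON.stringify(promptMsg)`: a JSON array of up to 10 ChatML-style turns whose `content` is a newline-joined run of the structured lines above. An abridged, hypothetical line might look like the following (a real line carries 10 message sequences, and the `...` placeholders elide the remaining fields and turns):

```js
// One line of output.json, abridged:
// [
//   {"role":"user","content":"{\"timestamp\":...,\"author\":\"vinso1445\",...}\n{\"timestamp\":...,\"author\":\"scoliono\",...}"},
//   {"role":"assistant","content":"{\"timestamp\":...,\"author\":\"f0oby\",\"name\":\"Myles Linden\",...}"},
//   ...
// ]
```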