New training data prep scripts
This commit is contained in:
		
							parent
							
								
									f755a03eb2
								
							
						
					
					
						commit
						8159b11f4f
					
				@ -1,9 +1,17 @@
 | 
			
		||||
/**
 | 
			
		||||
 * procToxicQA.js
 | 
			
		||||
 * This script assumes you have ToxicQA (https://huggingface.co/datasets/NobodyExistsOnTheInternet/toxicqa/blob/main/finalToxicQA.jsonl)
 | 
			
		||||
 * downloaded at 'toxicQA.json'.
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
const fs = require('node:fs');
 | 
			
		||||
var lineReader = require('readline').createInterface({
 | 
			
		||||
    input: fs.createReadStream('toxicQA.json')
 | 
			
		||||
});
 | 
			
		||||
var outstream = fs.createWriteStream('toxicQAfinal.json');
 | 
			
		||||
fs.unlinkSync('toxicQAfinal.json');
 | 
			
		||||
if (fs.existsSync('toxicQAfinal.json')) {
 | 
			
		||||
    fs.unlinkSync('toxicQAfinal.json');
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
lineReader.on('line', function (line) {
 | 
			
		||||
    const dialogue = JSON.parse(line)["conversations"];
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										131
									
								
								data/process.js
									
									
									
									
									
								
							
							
						
						
									
										131
									
								
								data/process.js
									
									
									
									
									
								
							@ -3,7 +3,7 @@ const JSONStream = require('JSONStream');
 | 
			
		||||
 | 
			
		||||
const MIKU_FREQ = 5;            // 1/5 = 20% of message chains are randomly chosen to be from Miku
 | 
			
		||||
const DROPOUT_UNFUNNY = 0.75;   // 75% dropout rate for message chains which have NO reactions
 | 
			
		||||
const USERNAMES = [
 | 
			
		||||
const SAVED_USERNAMES = [       // usernames as they were recorded in the chat log (@nickname)
 | 
			
		||||
    'vinny volcano\uD83C\uDF0B (伊胜焱)',
 | 
			
		||||
    'Server Comp!',
 | 
			
		||||
    'Make The Map \uD83D\uDDFA',
 | 
			
		||||
@ -18,7 +18,23 @@ const USERNAMES = [
 | 
			
		||||
    'cj7 by stephen chow (gmod PC)\uD83C\uDF41\uD83C\uDF42',
 | 
			
		||||
    'Nicolaid',
 | 
			
		||||
    'epbic',
 | 
			
		||||
    'Cap’n Vincent 🏴☠🏝',
 | 
			
		||||
    '1715 Galleonpilled Skipchud ⚓🦜',
 | 
			
		||||
    'me gold doubloons🏴☠🏆',
 | 
			
		||||
    'Boatswain Samuel ⚓⛵ 🌊'
 | 
			
		||||
];
 | 
			
		||||
const REAL_NAMES = {        // username to real name mapping
 | 
			
		||||
    'vinso1445': 'Vincent Iannelli',
 | 
			
		||||
    'scoliono': 'James Shiffer',
 | 
			
		||||
    'gnuwu': 'David Zheng',
 | 
			
		||||
    'f0oby': 'Myles Linden',
 | 
			
		||||
    'bapazheng': 'Myles Linden',
 | 
			
		||||
    'bapabakshi': 'Myles Linden',
 | 
			
		||||
    'keliande27': 'Myles Linden',
 | 
			
		||||
    '1thinker': 'Samuel Habib',
 | 
			
		||||
    'adam28405': 'Adam Kazerounian',
 | 
			
		||||
    'shibe.mp4': 'Jake Wong'
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
async function main() {
 | 
			
		||||
    let counter = 0;
 | 
			
		||||
@ -32,9 +48,77 @@ async function main() {
 | 
			
		||||
    let botAuthoredMsgSequence;
 | 
			
		||||
    let convoMsgSeqCount = 0;
 | 
			
		||||
    let convoReactCount = 0;
 | 
			
		||||
    let convoMsgs = [];
 | 
			
		||||
    let promptMsg = [];     // group of formatted msg seqs to be written to one line of the final dataset
 | 
			
		||||
    let discordMsgs = [];    // a single formatted message sequence
 | 
			
		||||
    let convoRefs = {};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    /**
     * Langchain structured output
     *
     * Beneath a few layers of abstraction, the finetuned model is ultimately prompted like so:
     *
     * ```
     * USER:
     * Answer the user query.
     * [ Langchain JSON structured output instructions ]
     * { ... "author": "vinso", "content": "message history 1" ... }
     * { ... "author": "f0oby", "content": "message history 2" ... }
     * { ... "author": "scoliono", "content": "message history 3" ... }
     *
     *
     * ASSISTANT:
     * { ... "author": "Hatsune Miku", "content": "message history 1" ... }
     * ```
     *
     * To this end, we have a function to format Discord messages in the same format as the
     * Pydantic object seen by Langchain. (The Langchain-specific instructions are not included.)
     *
     * Each turn by the user or assistant in the LLM-level conversation is henceforth known as a
     * "prompt message". The individual JSON lines in this example are supposed to represent
     * Discord messages, with one prompt message containing a "message sequence"'s worth. In the
     * actual JSONL dataset, though, one line represents 10 message sequences.
     *
     * Note: the training data will sometimes have multiple Discord messages in a single assistant
     * message sequence. Although it may seem unorthodox to have an LLM double-text you, this is
     * supposed to emulate a real Discord conversation, and real people have a tendency to split up
     * a thought across multiple messages. It's up to the inference code to decide what to do with
     * this.
     *
     * @param {object} msg - One exported Discord message. This function reads `msg.timestamp`,
     *     `msg.author.name`, `msg.type`, `msg.reference.messageId` (only when the type is
     *     "Reply") and `msg.reactions` (an array of `{ emoji: { code }, count }`).
     * @param {string} cleanContent - The message text after the caller's scrubbing.
     * @returns {string} A single-line JSON string with the keys `timestamp`, `author`, `name`,
     *     `context`, `content`, `reactions`, in that order. `name`, `context` and `reactions`
     *     are left out when they have no value.
     */
    function structurePrompt(msg, cleanContent) {
        // Both lookup tables are plain objects keyed by data from the chat log, so only
        // own properties may count as hits. A bare `in` test or `obj[key]` read would also
        // match inherited keys such as "__proto__" and leak `{}` into the dataset.
        const hasOwn = (table, key) => Object.prototype.hasOwnProperty.call(table, key);

        /**
         * Handle replies by maintaining a sliding window of message references.
         * If the replied-to message is too old to be part of this conversation, then leave this
         * message alone. If it's recent, then embed it as context for this message.
         */
        let repliedToContent;
        if (msg.type === "Reply" && hasOwn(convoRefs, msg.reference.messageId)) {
            repliedToContent = convoRefs[msg.reference.messageId];
        }

        // record reactions the message got in a compact string form, e.g.
        // ":joy: (2), :skull: (1)"; stays undefined when there are no reactions
        const reactionString = msg.reactions.length > 0
            ? msg.reactions
                .map((reaction) => `:${reaction.emoji.code}: (${reaction.count})`)
                .join(', ')
            : undefined;

        // only authors listed in REAL_NAMES get a `name` field
        const realName = hasOwn(REAL_NAMES, msg.author.name)
            ? REAL_NAMES[msg.author.name]
            : undefined;

        // 'name', 'context', 'reactions' could be undefined, in which case JSON.stringify
        // omits those fields
        return JSON.stringify({
            timestamp: (new Date(msg.timestamp)).toUTCString(),
            author: msg.author.name,
            name: realName,
            context: repliedToContent,
            content: cleanContent,
            reactions: reactionString
        });
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    stream.on('data', async (msg) => {
 | 
			
		||||
        // no bot/system messages
 | 
			
		||||
        if (msg.author.isBot || (msg.type !== "Default" && msg.type !== "Reply")) {
 | 
			
		||||
@ -44,25 +128,13 @@ async function main() {
 | 
			
		||||
        // scrub links
 | 
			
		||||
        let cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '');
 | 
			
		||||
        // scrub @mentions
 | 
			
		||||
        for (const username of USERNAMES) {
 | 
			
		||||
        for (const username of SAVED_USERNAMES) {
 | 
			
		||||
            cleanContent = cleanContent.replaceAll(`@${username}`, "");
 | 
			
		||||
        }
 | 
			
		||||
        if (!cleanContent) {
 | 
			
		||||
            return;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        /**
 | 
			
		||||
         * Handle replies by maintaining a sliding window of message references.
 | 
			
		||||
         * If the replied-to message is too old to be part of this conversation, then leave this
 | 
			
		||||
         * message alone.
 | 
			
		||||
         * If it's recent, then embed it as context for this message, using the old-fashioned
 | 
			
		||||
         * reply syntax: "> original message \n reply message"
 | 
			
		||||
         */
 | 
			
		||||
        if (msg.type === "Reply" && msg.reference.messageId in convoRefs) {
 | 
			
		||||
            const repliedToContentLines = convoRefs[msg.reference.messageId].split('\n');
 | 
			
		||||
            cleanContent = `> ${repliedToContentLines.join('\n> ')}\n${cleanContent}`;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // count reaction
 | 
			
		||||
        convoReactCount += msg.reactions.length;
 | 
			
		||||
 | 
			
		||||
@ -71,6 +143,14 @@ async function main() {
 | 
			
		||||
        if (lastMsgAuthor !== msg.author.id || (msgTime - lastMsgTime)/60000 >= 7) {
 | 
			
		||||
            lastMsgAuthor = msg.author.id;
 | 
			
		||||
 | 
			
		||||
            // follow chatML chat template when writing to prompt
 | 
			
		||||
            promptMsg.push({
 | 
			
		||||
                role: botAuthoredMsgSequence ? 'assistant' : 'user',
 | 
			
		||||
                content: discordMsgs.join('\n')
 | 
			
		||||
            });
 | 
			
		||||
 | 
			
		||||
            discordMsgs = [];
 | 
			
		||||
 | 
			
		||||
            // bot will pretend to author a random number of msg sequences
 | 
			
		||||
            botAuthoredMsgSequence = Math.floor(Math.random() * MIKU_FREQ) === 0;
 | 
			
		||||
 | 
			
		||||
@ -78,35 +158,34 @@ async function main() {
 | 
			
		||||
        }
 | 
			
		||||
        lastMsgTime = msgTime;
 | 
			
		||||
 | 
			
		||||
        // 10 msg sequences per "conversation"
 | 
			
		||||
        // 10 msg sequences (one prompt message each) per line of the final dataset
 | 
			
		||||
        if (convoMsgSeqCount === 10) {
 | 
			
		||||
            // dropout
 | 
			
		||||
            const convoKeep = convoReactCount > 0 || Math.random() >= DROPOUT_UNFUNNY;
 | 
			
		||||
            if (convoKeep) {
 | 
			
		||||
                // write JSONL format
 | 
			
		||||
                fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
 | 
			
		||||
                fs.appendFileSync('output.json', JSON.stringify(promptMsg) + '\n');
 | 
			
		||||
            }
 | 
			
		||||
            convoMsgSeqCount = convoReactCount = 0;
 | 
			
		||||
            convoMsgs = [];
 | 
			
		||||
            promptMsg = [];
 | 
			
		||||
            discordMsgs = [];
 | 
			
		||||
            convoRefs = {};
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // follow chatML chat template
 | 
			
		||||
        const outMsg = {
 | 
			
		||||
            role: botAuthoredMsgSequence ? "assistant" : "user",
 | 
			
		||||
            content: cleanContent
 | 
			
		||||
        };
 | 
			
		||||
        convoMsgs.push(outMsg);
 | 
			
		||||
        convoRefs[msg.id] = cleanContent;
 | 
			
		||||
 | 
			
		||||
        // write a single discord message to the prompt
 | 
			
		||||
        discordMsgs.push(structurePrompt(msg, cleanContent));
 | 
			
		||||
 | 
			
		||||
        if (++counter % 1000 === 0) {
 | 
			
		||||
            console.log(counter + " messages written");
 | 
			
		||||
        }
 | 
			
		||||
    });
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    stream.on('close', async () => {
 | 
			
		||||
        if (convoMsgs.length) {
 | 
			
		||||
            fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
 | 
			
		||||
        if (promptMsg.length) {
 | 
			
		||||
            fs.appendFileSync('output.json', JSON.stringify(promptMsg) + '\n');
 | 
			
		||||
        }
 | 
			
		||||
        console.log("Done!");
 | 
			
		||||
    });
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user