New training data prep scripts
This commit is contained in:
		
							parent
							
								
									f755a03eb2
								
							
						
					
					
						commit
						8159b11f4f
					
				| @ -1,9 +1,17 @@ | |||||||
|  | /** | ||||||
|  |  * procToxicQA.js | ||||||
|  |  * This script assumes you have ToxicQA (https://huggingface.co/datasets/NobodyExistsOnTheInternet/toxicqa/blob/main/finalToxicQA.jsonl)
 | ||||||
|  |  * downloaded and saved as 'toxicQA.json' in the current working directory. Output goes to 'toxicQAfinal.json'. | ||||||
|  |  */ | ||||||
|  | 
 | ||||||
| const fs = require('node:fs'); | const fs = require('node:fs'); | ||||||
| var lineReader = require('readline').createInterface({ | var lineReader = require('readline').createInterface({ | ||||||
|     input: fs.createReadStream('toxicQA.json') |     input: fs.createReadStream('toxicQA.json') | ||||||
| }); | }); | ||||||
| var outstream = fs.createWriteStream('toxicQAfinal.json'); | var outstream = fs.createWriteStream('toxicQAfinal.json'); | ||||||
| fs.unlinkSync('toxicQAfinal.json'); | if (fs.existsSync('toxicQAfinal.json')) { | ||||||
|  |     fs.unlinkSync('toxicQAfinal.json'); | ||||||
|  | } | ||||||
| 
 | 
 | ||||||
| lineReader.on('line', function (line) { | lineReader.on('line', function (line) { | ||||||
|     const dialogue = JSON.parse(line)["conversations"]; |     const dialogue = JSON.parse(line)["conversations"]; | ||||||
|  | |||||||
							
								
								
									
										131
									
								
								data/process.js
									
									
									
									
									
								
							
							
						
						
									
										131
									
								
								data/process.js
									
									
									
									
									
								
							| @ -3,7 +3,7 @@ const JSONStream = require('JSONStream'); | |||||||
| 
 | 
 | ||||||
| const MIKU_FREQ = 5;            // 1/5 = 20% of message chains are randomly chosen to be from Miku
 | const MIKU_FREQ = 5;            // 1/5 = 20% of message chains are randomly chosen to be from Miku
 | ||||||
| const DROPOUT_UNFUNNY = 0.75;   // 75% dropout rate for message chains which have NO reactions
 | const DROPOUT_UNFUNNY = 0.75;   // 75% dropout rate for message chains which have NO reactions
 | ||||||
| const USERNAMES = [ | const SAVED_USERNAMES = [       // usernames as they were recorded in the chat log (@nickname)
 | ||||||
|     'vinny volcano\uD83C\uDF0B (伊胜焱)', |     'vinny volcano\uD83C\uDF0B (伊胜焱)', | ||||||
|     'Server Comp!', |     'Server Comp!', | ||||||
|     'Make The Map \uD83D\uDDFA', |     'Make The Map \uD83D\uDDFA', | ||||||
| @ -18,7 +18,23 @@ const USERNAMES = [ | |||||||
|     'cj7 by stephen chow (gmod PC)\uD83C\uDF41\uD83C\uDF42', |     'cj7 by stephen chow (gmod PC)\uD83C\uDF41\uD83C\uDF42', | ||||||
|     'Nicolaid', |     'Nicolaid', | ||||||
|     'epbic', |     'epbic', | ||||||
|  |     'Cap’n Vincent 🏴☠🏝', | ||||||
|  |     '1715 Galleonpilled Skipchud ⚓🦜', | ||||||
|  |     'me gold doubloons🏴☠🏆', | ||||||
|  |     'Boatswain Samuel ⚓⛵ 🌊' | ||||||
| ]; | ]; | ||||||
|  | const REAL_NAMES = {        // username to real name mapping
 | ||||||
|  |     'vinso1445': 'Vincent Iannelli', | ||||||
|  |     'scoliono': 'James Shiffer', | ||||||
|  |     'gnuwu': 'David Zheng', | ||||||
|  |     'f0oby': 'Myles Linden', | ||||||
|  |     'bapazheng': 'Myles Linden', | ||||||
|  |     'bapabakshi': 'Myles Linden', | ||||||
|  |     'keliande27': 'Myles Linden', | ||||||
|  |     '1thinker': 'Samuel Habib', | ||||||
|  |     'adam28405': 'Adam Kazerounian', | ||||||
|  |     'shibe.mp4': 'Jake Wong' | ||||||
|  | }; | ||||||
| 
 | 
 | ||||||
| async function main() { | async function main() { | ||||||
|     let counter = 0; |     let counter = 0; | ||||||
| @ -32,9 +48,77 @@ async function main() { | |||||||
|     let botAuthoredMsgSequence; |     let botAuthoredMsgSequence; | ||||||
|     let convoMsgSeqCount = 0; |     let convoMsgSeqCount = 0; | ||||||
|     let convoReactCount = 0; |     let convoReactCount = 0; | ||||||
|     let convoMsgs = []; |     let promptMsg = [];     // group of formatted msg seqs to be written to one line of the final dataset
 | ||||||
|  |     let discordMsgs = [];    // a single formatted message sequence
 | ||||||
|     let convoRefs = {}; |     let convoRefs = {}; | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|  |     /** | ||||||
|  |      * Langchain structured output | ||||||
|  |      * | ||||||
|  |      * Beneath a few layers of abstraction, the finetuned model is ultimately prompted like so: | ||||||
|  |      * | ||||||
|  |      * ``` | ||||||
|  |      * USER: | ||||||
|  |      * Answer the user query. | ||||||
|  |      * [ Langchain JSON structured output instructions ] | ||||||
|  |      * { ... "author": "vinso", "content": "message history 1" ... } | ||||||
|  |      * { ... "author": "f0oby", "content": "message history 2" ... } | ||||||
|  |      * { ... "author": "scoliono", "content": "message history 3" ... } | ||||||
|  |      * | ||||||
|  |      * | ||||||
|  |      * ASSISTANT: | ||||||
|  |      * { ... "author": "Hatsune Miku", "content": "message history 1" ... } | ||||||
|  |      * ``` | ||||||
|  |      * | ||||||
|  |      * To this end, we have a function to format Discord messages in the same format as the | ||||||
|  |      * Pydantic object seen by Langchain. (The Langchain-specific instructions are not included.) | ||||||
|  |      * | ||||||
|  |      * Each turn by the user or assistant in the LLM-level conversation is henceforth known as a | ||||||
|  |      * "prompt message". Each JSON line in the example above represents one Discord message, and | ||||||
|  |      * one prompt message holds one "message sequence": a run of Discord messages from the same | ||||||
|  |      * author with less than 7 minutes between consecutive messages. In the actual JSONL dataset, | ||||||
|  |      * though, one line represents 10 message sequences. | ||||||
|  |      * | ||||||
|  |      * Note: the training data will sometimes have multiple Discord messages in a single assistant | ||||||
|  |      * message sequence. Although it may seem unorthodox to have an LLM double-text you, this is | ||||||
|  |      * supposed to emulate a real Discord conversation, and real people have a tendency to split up | ||||||
|  |      * a thought across multiple messages. It's up to the inference code to decide what to do with | ||||||
|  |      * this. | ||||||
|  |      */ | ||||||
|  |     function structurePrompt(msg, cleanContent) { | ||||||
|  |         /** | ||||||
|  |          * Handle replies by maintaining a sliding window of message references. | ||||||
|  |          * If the replied-to message is too old to be part of this conversation, then leave this | ||||||
|  |          * message alone. If it's recent, then embed it as context for this message. | ||||||
|  |          */ | ||||||
|  |         let repliedToContent; | ||||||
|  |         if (msg.type === "Reply" && msg.reference.messageId in convoRefs) { | ||||||
|  |             repliedToContent = convoRefs[msg.reference.messageId]; | ||||||
|  |         } | ||||||
|  |         // record reactions the message got in a compact string form
 | ||||||
|  |         let reactionString; | ||||||
|  |         for (const reaction of msg.reactions) { | ||||||
|  |             if (reactionString === undefined) { | ||||||
|  |                 reactionString = ''; | ||||||
|  |             } | ||||||
|  |             if (reactionString && reactionString.length > 0) { | ||||||
|  |                 reactionString += ', '; | ||||||
|  |             } | ||||||
|  |             reactionString += `:${reaction.emoji.code}: (${reaction.count})`; | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         // 'name', 'context', 'reactions' could be undefined, in which case those fields are omitted
 | ||||||
|  |         return JSON.stringify({ | ||||||
|  |             timestamp: (new Date(msg.timestamp)).toUTCString(), | ||||||
|  |             author: msg.author.name, | ||||||
|  |             name: REAL_NAMES[msg.author.name], | ||||||
|  |             context: repliedToContent, | ||||||
|  |             content: cleanContent, | ||||||
|  |             reactions: reactionString | ||||||
|  |         }); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|     stream.on('data', async (msg) => { |     stream.on('data', async (msg) => { | ||||||
|         // no bot/system messages
 |         // no bot/system messages
 | ||||||
|         if (msg.author.isBot || (msg.type !== "Default" && msg.type !== "Reply")) { |         if (msg.author.isBot || (msg.type !== "Default" && msg.type !== "Reply")) { | ||||||
| @ -44,25 +128,13 @@ async function main() { | |||||||
|         // scrub links
 |         // scrub links
 | ||||||
|         let cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, ''); |         let cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, ''); | ||||||
|         // scrub @mentions
 |         // scrub @mentions
 | ||||||
|         for (const username of USERNAMES) { |         for (const username of SAVED_USERNAMES) { | ||||||
|             cleanContent = cleanContent.replaceAll(`@${username}`, ""); |             cleanContent = cleanContent.replaceAll(`@${username}`, ""); | ||||||
|         } |         } | ||||||
|         if (!cleanContent) { |         if (!cleanContent) { | ||||||
|             return; |             return; | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|         /** |  | ||||||
|          * Handle replies by maintaining a sliding window of message references. |  | ||||||
|          * If the replied-to message is too old to be part of this conversation, then leave this |  | ||||||
|          * message alone. |  | ||||||
|          * If it's recent, then embed it as context for this message, using the old-fashioned |  | ||||||
|          * reply syntax: "> original message \n reply message" |  | ||||||
|          */ |  | ||||||
|         if (msg.type === "Reply" && msg.reference.messageId in convoRefs) { |  | ||||||
|             const repliedToContentLines = convoRefs[msg.reference.messageId].split('\n'); |  | ||||||
|             cleanContent = `> ${repliedToContentLines.join('\n> ')}\n${cleanContent}`; |  | ||||||
|         } |  | ||||||
| 
 |  | ||||||
|         // count reaction
 |         // count reaction
 | ||||||
|         convoReactCount += msg.reactions.length; |         convoReactCount += msg.reactions.length; | ||||||
| 
 | 
 | ||||||
| @ -71,6 +143,14 @@ async function main() { | |||||||
|         if (lastMsgAuthor !== msg.author.id || (msgTime - lastMsgTime)/60000 >= 7) { |         if (lastMsgAuthor !== msg.author.id || (msgTime - lastMsgTime)/60000 >= 7) { | ||||||
|             lastMsgAuthor = msg.author.id; |             lastMsgAuthor = msg.author.id; | ||||||
| 
 | 
 | ||||||
|  |             // follow chatML chat template when writing to prompt
 | ||||||
|  |             promptMsg.push({ | ||||||
|  |                 role: botAuthoredMsgSequence ? 'assistant' : 'user', | ||||||
|  |                 content: discordMsgs.join('\n') | ||||||
|  |             }); | ||||||
|  | 
 | ||||||
|  |             discordMsgs = []; | ||||||
|  | 
 | ||||||
|             // bot will pretend to author a random number of msg sequences
 |             // bot will pretend to author a random number of msg sequences
 | ||||||
|             botAuthoredMsgSequence = Math.floor(Math.random() * MIKU_FREQ) === 0; |             botAuthoredMsgSequence = Math.floor(Math.random() * MIKU_FREQ) === 0; | ||||||
| 
 | 
 | ||||||
| @ -78,35 +158,34 @@ async function main() { | |||||||
|         } |         } | ||||||
|         lastMsgTime = msgTime; |         lastMsgTime = msgTime; | ||||||
| 
 | 
 | ||||||
|         // 10 msg sequences per "conversation"
 |         // 10 msg sequences per line of the JSONL dataset (one "conversation")
 | ||||||
|         if (convoMsgSeqCount === 10) { |         if (convoMsgSeqCount === 10) { | ||||||
|             // dropout
 |             // dropout
 | ||||||
|             const convoKeep = convoReactCount > 0 || Math.random() >= DROPOUT_UNFUNNY; |             const convoKeep = convoReactCount > 0 || Math.random() >= DROPOUT_UNFUNNY; | ||||||
|             if (convoKeep) { |             if (convoKeep) { | ||||||
|                 // write JSONL format
 |                 // write JSONL format
 | ||||||
|                 fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n'); |                 fs.appendFileSync('output.json', JSON.stringify(promptMsg) + '\n'); | ||||||
|             } |             } | ||||||
|             convoMsgSeqCount = convoReactCount = 0; |             convoMsgSeqCount = convoReactCount = 0; | ||||||
|             convoMsgs = []; |             promptMsg = []; | ||||||
|  |             discordMsgs = []; | ||||||
|             convoRefs = {}; |             convoRefs = {}; | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|         // follow chatML chat template
 |  | ||||||
|         const outMsg = { |  | ||||||
|             role: botAuthoredMsgSequence ? "assistant" : "user", |  | ||||||
|             content: cleanContent |  | ||||||
|         }; |  | ||||||
|         convoMsgs.push(outMsg); |  | ||||||
|         convoRefs[msg.id] = cleanContent; |         convoRefs[msg.id] = cleanContent; | ||||||
| 
 | 
 | ||||||
|  |         // write a single discord message to the prompt
 | ||||||
|  |         discordMsgs.push(structurePrompt(msg, cleanContent)); | ||||||
|  | 
 | ||||||
|         if (++counter % 1000 === 0) { |         if (++counter % 1000 === 0) { | ||||||
|             console.log(counter + " messages written"); |             console.log(counter + " messages written"); | ||||||
|         } |         } | ||||||
|     }); |     }); | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|     stream.on('close', async () => { |     stream.on('close', async () => { | ||||||
|         if (convoMsgs.length) { |         if (promptMsg.length) { | ||||||
|             fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n'); |             fs.appendFileSync('output.json', JSON.stringify(promptMsg) + '\n'); | ||||||
|         } |         } | ||||||
|         console.log("Done!"); |         console.log("Done!"); | ||||||
|     }); |     }); | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user