Anonymize usernames again, and try including reply context
This commit is contained in:
		
							parent
							
								
									cf09ad5a77
								
							
						
					
					
						commit
						87e9cc39e0
					
				| @ -2,7 +2,7 @@ const fs = require('node:fs'); | ||||
| const JSONStream = require('JSONStream'); | ||||
| 
 | ||||
| const MIKU_FREQ = 5;            // 1/5 = 20% of message chains are randomly chosen to be from Miku
 | ||||
| const DROPOUT_UNFUNNY = 0.75;   // 75% dropout rate for messages chains which have NO reactions
 | ||||
| const DROPOUT_UNFUNNY = 0.75;   // 75% dropout rate for message chains which have NO reactions
 | ||||
| const USERNAMES = [ | ||||
|     'vinny volcano\uD83C\uDF0B (伊胜焱)', | ||||
|     'Server Comp!', | ||||
| @ -33,6 +33,7 @@ async function main() { | ||||
|     let convoMsgSeqCount = 0; | ||||
|     let convoReactCount = 0; | ||||
|     let convoMsgs = []; | ||||
|     let convoRefs = {}; | ||||
| 
 | ||||
|     stream.on('data', async (msg) => { | ||||
|         // no bot/system messages
 | ||||
| @ -40,18 +41,6 @@ async function main() { | ||||
|             return; | ||||
|         } | ||||
| 
 | ||||
|         /** | ||||
|          * Replies are a tricky case. I considered pasting their context in, except this | ||||
|          * requires keeping a full message cache and won't scale. Another possibility is | ||||
|          * to maintain a small sliding window of message history and delete replies which | ||||
|          * reference a message too far in the past... but what if that reply gets replied | ||||
|          * to right after? Our chat in particular has a lot of these "necro" messages, but | ||||
|          * since they tend to spark further discussion if anything, they probably don't | ||||
|          * noticeably obfuscate the flow of conversation compared to normal time skips, | ||||
|          * which our model is incapable of picking up in the first place. | ||||
|          * TLDR: Keep the replies. Making too many assumptions is bad. | ||||
|          */ | ||||
| 
 | ||||
|         // scrub links
 | ||||
|         let cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, ''); | ||||
|         // scrub @mentions
 | ||||
| @ -62,6 +51,18 @@ async function main() { | ||||
|             return; | ||||
|         } | ||||
| 
 | ||||
|         /** | ||||
|          * Handle replies by maintaining a sliding window of message references. | ||||
|          * If the replied-to message is too old to be part of this conversation, then leave this | ||||
|          * message alone. | ||||
|          * If it's recent, then embed it as context for this message, using the old-fashioned | ||||
|          * reply syntax: "> original message \n reply message" | ||||
|          */ | ||||
|         if (msg.type === "Reply" && msg.reference.messageId in convoRefs) { | ||||
|             const repliedToContentLines = convoRefs[msg.reference.messageId].split('\n'); | ||||
|             cleanContent = `> ${repliedToContentLines.join('\n> ')}\n${cleanContent}`; | ||||
|         } | ||||
| 
 | ||||
|         // count reaction
 | ||||
|         convoReactCount += msg.reactions.length; | ||||
| 
 | ||||
| @ -87,14 +88,16 @@ async function main() { | ||||
|             } | ||||
|             convoMsgSeqCount = convoReactCount = 0; | ||||
|             convoMsgs = []; | ||||
|             convoRefs = {}; | ||||
|         } | ||||
| 
 | ||||
|         // follow chatML chat template
 | ||||
|         const outMsg = { | ||||
|             role: botAuthoredMsgSequence ? "assistant" : msg.author.name, | ||||
|             role: botAuthoredMsgSequence ? "assistant" : "user", | ||||
|             content: cleanContent | ||||
|         }; | ||||
|         convoMsgs.push(outMsg); | ||||
|         convoRefs[msg.id] = cleanContent; | ||||
| 
 | ||||
|         if (++counter % 1000 === 0) { | ||||
|             console.log(counter + " messages written"); | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user