Anonymize usernames again, and try including reply context
This commit is contained in:
		
							parent
							
								
									cf09ad5a77
								
							
						
					
					
						commit
						87e9cc39e0
					
				@ -2,7 +2,7 @@ const fs = require('node:fs');
 | 
				
			|||||||
const JSONStream = require('JSONStream');
 | 
					const JSONStream = require('JSONStream');
 | 
				
			||||||
 | 
					
 | 
				
			||||||
const MIKU_FREQ = 5;            // 1/5 = 20% of message chains are randomly chosen to be from Miku
 | 
					const MIKU_FREQ = 5;            // 1/5 = 20% of message chains are randomly chosen to be from Miku
 | 
				
			||||||
const DROPOUT_UNFUNNY = 0.75;   // 75% dropout rate for messages chains which have NO reactions
 | 
					const DROPOUT_UNFUNNY = 0.75;   // 75% dropout rate for message chains which have NO reactions
 | 
				
			||||||
const USERNAMES = [
 | 
					const USERNAMES = [
 | 
				
			||||||
    'vinny volcano\uD83C\uDF0B (伊胜焱)',
 | 
					    'vinny volcano\uD83C\uDF0B (伊胜焱)',
 | 
				
			||||||
    'Server Comp!',
 | 
					    'Server Comp!',
 | 
				
			||||||
@ -33,6 +33,7 @@ async function main() {
 | 
				
			|||||||
    let convoMsgSeqCount = 0;
 | 
					    let convoMsgSeqCount = 0;
 | 
				
			||||||
    let convoReactCount = 0;
 | 
					    let convoReactCount = 0;
 | 
				
			||||||
    let convoMsgs = [];
 | 
					    let convoMsgs = [];
 | 
				
			||||||
 | 
					    let convoRefs = {};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    stream.on('data', async (msg) => {
 | 
					    stream.on('data', async (msg) => {
 | 
				
			||||||
        // no bot/system messages
 | 
					        // no bot/system messages
 | 
				
			||||||
@ -40,18 +41,6 @@ async function main() {
 | 
				
			|||||||
            return;
 | 
					            return;
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        /**
 | 
					 | 
				
			||||||
         * Replies are a tricky case. I considered pasting their context in, except this
 | 
					 | 
				
			||||||
         * requires keeping a full message cache and won't scale. Another possibility is
 | 
					 | 
				
			||||||
         * to maintain a small sliding window of message history and delete replies which
 | 
					 | 
				
			||||||
         * reference a message too far in the past... but what if that reply gets replied
 | 
					 | 
				
			||||||
         * to right after? Our chat in particular has a lot of these "necro" messages, but
 | 
					 | 
				
			||||||
         * since they tend to spark further discussion if anything, they probably don't
 | 
					 | 
				
			||||||
         * noticeably obfuscate the flow of conversation compared to normal time skips,
 | 
					 | 
				
			||||||
         * which our model is incapable of picking up in the first place.
 | 
					 | 
				
			||||||
         * TLDR: Keep the replies. Making too many assumptions is bad.
 | 
					 | 
				
			||||||
         */
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        // scrub links
 | 
					        // scrub links
 | 
				
			||||||
        let cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '');
 | 
					        let cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '');
 | 
				
			||||||
        // scrub @mentions
 | 
					        // scrub @mentions
 | 
				
			||||||
@ -62,6 +51,18 @@ async function main() {
 | 
				
			|||||||
            return;
 | 
					            return;
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        /**
 | 
				
			||||||
 | 
					         * Handle replies by maintaining a sliding window of message references.
 | 
				
			||||||
 | 
					         * If the replied-to message is too old to be part of this conversation, then leave this
 | 
				
			||||||
 | 
					         * message alone.
 | 
				
			||||||
 | 
					         * If it's recent, then embed it as context for this message, using the old-fashioned
 | 
				
			||||||
 | 
					         * reply syntax: "> original message \n reply message"
 | 
				
			||||||
 | 
					         */
 | 
				
			||||||
 | 
					        if (msg.type === "Reply" && msg.reference.messageId in convoRefs) {
 | 
				
			||||||
 | 
					            const repliedToContentLines = convoRefs[msg.reference.messageId].split('\n');
 | 
				
			||||||
 | 
					            cleanContent = `> ${repliedToContentLines.join('\n> ')}\n${cleanContent}`;
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        // count reaction
 | 
					        // count reaction
 | 
				
			||||||
        convoReactCount += msg.reactions.length;
 | 
					        convoReactCount += msg.reactions.length;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -87,14 +88,16 @@ async function main() {
 | 
				
			|||||||
            }
 | 
					            }
 | 
				
			||||||
            convoMsgSeqCount = convoReactCount = 0;
 | 
					            convoMsgSeqCount = convoReactCount = 0;
 | 
				
			||||||
            convoMsgs = [];
 | 
					            convoMsgs = [];
 | 
				
			||||||
 | 
					            convoRefs = {};
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        // follow chatML chat template
 | 
					        // follow chatML chat template
 | 
				
			||||||
        const outMsg = {
 | 
					        const outMsg = {
 | 
				
			||||||
            role: botAuthoredMsgSequence ? "assistant" : msg.author.name,
 | 
					            role: botAuthoredMsgSequence ? "assistant" : "user",
 | 
				
			||||||
            content: cleanContent
 | 
					            content: cleanContent
 | 
				
			||||||
        };
 | 
					        };
 | 
				
			||||||
        convoMsgs.push(outMsg);
 | 
					        convoMsgs.push(outMsg);
 | 
				
			||||||
 | 
					        convoRefs[msg.id] = cleanContent;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if (++counter % 1000 === 0) {
 | 
					        if (++counter % 1000 === 0) {
 | 
				
			||||||
            console.log(counter + " messages written");
 | 
					            console.log(counter + " messages written");
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user