Anonymize usernames again, and try including reply context
This commit is contained in:
		
							parent
							
								
									cf09ad5a77
								
							
						
					
					
						commit
						87e9cc39e0
					
				@@ -2,7 +2,7 @@ const fs = require('node:fs');
 | 
			
		||||
const JSONStream = require('JSONStream');
 | 
			
		||||
 | 
			
		||||
const MIKU_FREQ = 5;            // 1/5 = 20% of message chains are randomly chosen to be from Miku
 | 
			
		||||
const DROPOUT_UNFUNNY = 0.75;   // 75% dropout rate for messages chains which have NO reactions
 | 
			
		||||
const DROPOUT_UNFUNNY = 0.75;   // 75% dropout rate for message chains which have NO reactions
 | 
			
		||||
const USERNAMES = [
 | 
			
		||||
    'vinny volcano\uD83C\uDF0B (伊胜焱)',
 | 
			
		||||
    'Server Comp!',
 | 
			
		||||
@@ -33,6 +33,7 @@ async function main() {
 | 
			
		||||
    let convoMsgSeqCount = 0;
 | 
			
		||||
    let convoReactCount = 0;
 | 
			
		||||
    let convoMsgs = [];
 | 
			
		||||
    let convoRefs = {};
 | 
			
		||||
 | 
			
		||||
    stream.on('data', async (msg) => {
 | 
			
		||||
        // no bot/system messages
 | 
			
		||||
@@ -40,18 +41,6 @@ async function main() {
 | 
			
		||||
            return;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        /**
 | 
			
		||||
         * Replies are a tricky case. I considered pasting their context in, except this
 | 
			
		||||
         * requires keeping a full message cache and won't scale. Another possibility is
 | 
			
		||||
         * to maintain a small sliding window of message history and delete replies which
 | 
			
		||||
         * reference a message too far in the past... but what if that reply gets replied
 | 
			
		||||
         * to right after? Our chat in particular has a lot of these "necro" messages, but
 | 
			
		||||
         * since they tend to spark further discussion if anything, they probably don't
 | 
			
		||||
         * noticeably obfuscate the flow of conversation compared to normal time skips,
 | 
			
		||||
         * which our model is incapable of picking up in the first place.
 | 
			
		||||
         * TLDR: Keep the replies. Making too many assumptions is bad.
 | 
			
		||||
         */
 | 
			
		||||
 | 
			
		||||
        // scrub links
 | 
			
		||||
        let cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '');
 | 
			
		||||
        // scrub @mentions
 | 
			
		||||
@@ -62,6 +51,18 @@ async function main() {
 | 
			
		||||
            return;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        /**
 | 
			
		||||
         * Handle replies by maintaining a sliding window of message references.
 | 
			
		||||
         * If the replied-to message is too old to be part of this conversation, then leave this
 | 
			
		||||
         * message alone.
 | 
			
		||||
         * If it's recent, then embed it as context for this message, using the old-fashioned
 | 
			
		||||
         * reply syntax: "> original message \n reply message"
 | 
			
		||||
         */
 | 
			
		||||
        if (msg.type === "Reply" && msg.reference.messageId in convoRefs) {
 | 
			
		||||
            const repliedToContentLines = convoRefs[msg.reference.messageId].split('\n');
 | 
			
		||||
            cleanContent = `> ${repliedToContentLines.join('\n> ')}\n${cleanContent}`;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // count reaction
 | 
			
		||||
        convoReactCount += msg.reactions.length;
 | 
			
		||||
 | 
			
		||||
@@ -87,14 +88,16 @@ async function main() {
 | 
			
		||||
            }
 | 
			
		||||
            convoMsgSeqCount = convoReactCount = 0;
 | 
			
		||||
            convoMsgs = [];
 | 
			
		||||
            convoRefs = {};
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // follow chatML chat template
 | 
			
		||||
        const outMsg = {
 | 
			
		||||
            role: botAuthoredMsgSequence ? "assistant" : msg.author.name,
 | 
			
		||||
            role: botAuthoredMsgSequence ? "assistant" : "user",
 | 
			
		||||
            content: cleanContent
 | 
			
		||||
        };
 | 
			
		||||
        convoMsgs.push(outMsg);
 | 
			
		||||
        convoRefs[msg.id] = cleanContent;
 | 
			
		||||
 | 
			
		||||
        if (++counter % 1000 === 0) {
 | 
			
		||||
            console.log(counter + " messages written");
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user