From a572dc62f1eed03dda6a0964a91f2f766f175993 Mon Sep 17 00:00:00 2001 From: James S Date: Sat, 25 May 2024 04:35:24 +0000 Subject: [PATCH] Strip mentions, dropout for non-reaction message chains --- data/process.js | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/data/process.js b/data/process.js index 1fa36b8..4461d5d 100644 --- a/data/process.js +++ b/data/process.js @@ -1,7 +1,24 @@ const fs = require('node:fs'); const JSONStream = require('JSONStream'); -const MIKU_FREQ = 5; // 1/5 = 20% of message chains are randomly chosen to be from Miku +const MIKU_FREQ = 5; // 1/5 = 20% of message chains are randomly chosen to be from Miku +const DROPOUT_UNFUNNY = 0.75; // 75% dropout rate for message chains which have NO reactions +const USERNAMES = [ + 'vinny volcano\uD83C\uDF0B (伊胜焱)', + 'Server Comp!', + 'Make The Map \uD83D\uDDFA', + '1981 Celical Man\uD83C\uDF41\uD83C\uDF42', + 'Hatsune Miku', + 'Cutie Kazerounian\uD83C\uDF41\uD83C\uDF42', + 'David Pan (Fembooru)\uD83C\uDF41\uD83C\uDF42', + 'Exiled Sammy \uD83D\uDD12\uD83C\uDFDD⏱', + 'shibe.mp4❄☃', + 'Today Man-San(1990)\uD83C\uDF41\uD83C\uDF42', + 'owner', + 'cj7 by stephen chow (gmod PC)\uD83C\uDF41\uD83C\uDF42', + 'Nicolaid', + 'epbic', +]; async function main() { let counter = 0; @@ -14,6 +31,7 @@ async function main() { let lastMsgTime; let botAuthoredMsgSequence; let convoMsgSeqCount = 0; + let convoReactCount = 0; let convoMsgs = []; stream.on('data', async (msg) => { @@ -35,11 +53,18 @@ async function main() { */ // scrub links - const cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, ''); + let cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, ''); + // scrub @mentions + for (const username of USERNAMES) { + cleanContent = cleanContent.replaceAll(`@${username}`, ""); + } if (!cleanContent) { return; } + // count reactions + convoReactCount += msg.reactions.length; + // determine continuity of message sequences let msgTime = new 
Date(msg.timestamp); if (lastMsgAuthor !== msg.author.id || (msgTime - lastMsgTime)/60000 >= 7) { @@ -54,15 +79,19 @@ async function main() { // 10 msg sequences per "conversation" if (convoMsgSeqCount === 10) { - // write JSONL format - fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n'); - convoMsgSeqCount = 0; + // dropout + const convoKeep = convoReactCount > 0 || Math.random() >= DROPOUT_UNFUNNY; + if (convoKeep) { + // write JSONL format + fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n'); + } + convoMsgSeqCount = convoReactCount = 0; convoMsgs = []; } // follow chatML chat template const outMsg = { - role: botAuthoredMsgSequence ? "assistant" : "user", + role: botAuthoredMsgSequence ? "assistant" : msg.author.name, content: cleanContent }; convoMsgs.push(outMsg);