Anonymize usernames again, and try including reply context
This commit is contained in:
parent
cf09ad5a77
commit
87e9cc39e0
@ -2,7 +2,7 @@ const fs = require('node:fs');
|
||||
const JSONStream = require('JSONStream');
|
||||
|
||||
const MIKU_FREQ = 5; // 1/5 = 20% of message chains are randomly chosen to be from Miku
|
||||
const DROPOUT_UNFUNNY = 0.75; // 75% dropout rate for messages chains which have NO reactions
|
||||
const DROPOUT_UNFUNNY = 0.75; // 75% dropout rate for message chains which have NO reactions
|
||||
const USERNAMES = [
|
||||
'vinny volcano\uD83C\uDF0B (伊胜焱)',
|
||||
'Server Comp!',
|
||||
@ -33,6 +33,7 @@ async function main() {
|
||||
let convoMsgSeqCount = 0;
|
||||
let convoReactCount = 0;
|
||||
let convoMsgs = [];
|
||||
let convoRefs = {};
|
||||
|
||||
stream.on('data', async (msg) => {
|
||||
// no bot/system messages
|
||||
@ -40,18 +41,6 @@ async function main() {
|
||||
return;
|
||||
}
|
||||
|
||||
/**
|
||||
* Replies are a tricky case. I considered pasting their context in, except this
|
||||
* requires keeping a full message cache and won't scale. Another possibility is
|
||||
* to maintain a small sliding window of message history and delete replies which
|
||||
* reference a message too far in the past... but what if that reply gets replied
|
||||
* to right after? Our chat in particular has a lot of these "necro" messages, but
|
||||
* since they tend to spark further discussion if anything, they probably don't
|
||||
* noticeably obfuscate the flow of conversation compared to normal time skips,
|
||||
* which our model is incapable of picking up in the first place.
|
||||
* TLDR: Keep the replies. Making too many assumptions is bad.
|
||||
*/
|
||||
|
||||
// scrub links
|
||||
let cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '');
|
||||
// scrub @mentions
|
||||
@ -62,6 +51,18 @@ async function main() {
|
||||
return;
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle replies by maintaining a sliding window of message references.
|
||||
* If the replied-to message is too old to be part of this conversation, then leave this
|
||||
* message alone.
|
||||
* If it's recent, then embed it as context for this message, using the old-fashioned
|
||||
* reply syntax: "> original message \n reply message"
|
||||
*/
|
||||
if (msg.type === "Reply" && msg.reference.messageId in convoRefs) {
|
||||
const repliedToContentLines = convoRefs[msg.reference.messageId].split('\n');
|
||||
cleanContent = `> ${repliedToContentLines.join('\n> ')}\n${cleanContent}`;
|
||||
}
|
||||
|
||||
// count this message's reactions toward the conversation total
|
||||
convoReactCount += msg.reactions.length;
|
||||
|
||||
@ -87,14 +88,16 @@ async function main() {
|
||||
}
|
||||
convoMsgSeqCount = convoReactCount = 0;
|
||||
convoMsgs = [];
|
||||
convoRefs = {};
|
||||
}
|
||||
|
||||
// follow chatML chat template
|
||||
const outMsg = {
|
||||
role: botAuthoredMsgSequence ? "assistant" : msg.author.name,
|
||||
role: botAuthoredMsgSequence ? "assistant" : "user",
|
||||
content: cleanContent
|
||||
};
|
||||
convoMsgs.push(outMsg);
|
||||
convoRefs[msg.id] = cleanContent;
|
||||
|
||||
if (++counter % 1000 === 0) {
|
||||
console.log(counter + " messages written");
|
||||
|
Loading…
x
Reference in New Issue
Block a user