const fs = require('node:fs');
const fsProm = require('node:fs/promises');

const JSONStream = require('JSONStream');

const MIKU_FREQ = 5; // 1/5 = 20% of message chains are randomly chosen to be from Miku
const DROPOUT_UNFUNNY = 0.75; // 75% dropout rate for message chains which have NO reactions
const USERNAMES = [
    'vinny volcano\uD83C\uDF0B (伊胜焱)',
    'Server Comp!',
    'Make The Map \uD83D\uDDFA',
    '1981 Celical Man\uD83C\uDF41\uD83C\uDF42',
    'Hatsune Miku',
    'Cutie Kazerounian\uD83C\uDF41\uD83C\uDF42',
    'David Pan (Fembooru)\uD83C\uDF41\uD83C\uDF42',
    'Exiled Sammy \uD83D\uDD12\uD83C\uDFDD⏱',
    'shibe.mp4❄☃',
    'Today Man-San(1990)\uD83C\uDF41\uD83C\uDF42',
    'owner',
    'cj7 by stephen chow (gmod PC)\uD83C\uDF41\uD83C\uDF42',
    'Nicolaid',
    'epbic',
];

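// Stream the chat export from input.json (top-level `messages` array), group messages
// into "conversations" of ten message sequences, and append each kept conversation to
// output.json as one JSONL line of {role, content} messages.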
async function main() {
    let counter = 0;
    try {
        // start fresh: remove output from a previous run (ignore if it doesn't exist)
        await fsProm.unlink('output.json');
    } catch {}

    const stream = fs.createReadStream('input.json').pipe(JSONStream.parse('messages.*'));
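
    // rolling state for grouping messages into sequences and sequences into conversations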
    let lastMsgAuthor;
    let lastMsgTime;
    let botAuthoredMsgSequence;
    let convoMsgSeqCount = 0;
    let convoReactCount = 0;
    let convoMsgs = [];

    stream.on('data', async (msg) => {
        // no bot/system messages
        if (msg.author.isBot || (msg.type !== "Default" && msg.type !== "Reply")) {
            return;
        }

        /**
         * Replies are a tricky case. I considered pasting their context in, except this
         * requires keeping a full message cache and won't scale. Another possibility is
         * to maintain a small sliding window of message history and delete replies which
         * reference a message too far in the past... but what if that reply gets replied
         * to right after? Our chat in particular has a lot of these "necro" messages, but
         * since they tend to spark further discussion if anything, they probably don't
         * noticeably obfuscate the flow of conversation compared to normal time skips,
         * which our model is incapable of picking up in the first place.
         * TLDR: Keep the replies. Making too many assumptions is bad.
         */

        // scrub links
        let cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '');

        // scrub @mentions
        for (const username of USERNAMES) {
            cleanContent = cleanContent.replaceAll(`@${username}`, "");
        }
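
        // skip messages that are empty after scrubbing (e.g. link-only or attachment-only posts)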
        if (!cleanContent) {
            return;
        }

        // count reactions
        convoReactCount += msg.reactions.length;

        // determine continuity of message sequences: a new sequence starts when the
        // author changes or 7+ minutes have passed since the previous message
        let msgTime = new Date(msg.timestamp);
        if (lastMsgAuthor !== msg.author.id || (msgTime - lastMsgTime) / 60000 >= 7) {
            lastMsgAuthor = msg.author.id;

            // the bot pretends to author a randomly chosen ~1/MIKU_FREQ of message sequences
            botAuthoredMsgSequence = Math.floor(Math.random() * MIKU_FREQ) === 0;

            ++convoMsgSeqCount;
        }
        lastMsgTime = msgTime;

        // 10 msg sequences per "conversation"
        if (convoMsgSeqCount === 10) {
            // dropout: discard most conversations that earned no reactions
            const convoKeep = convoReactCount > 0 || Math.random() >= DROPOUT_UNFUNNY;
            if (convoKeep) {
                // write JSONL format: one conversation per line
                fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
            }
            convoMsgSeqCount = convoReactCount = 0;
            convoMsgs = [];
        }

        // follow the ChatML chat template: sequences relabeled as the bot become
        // "assistant" turns, everyone else's keep the author's name as the role
        const outMsg = {
            role: botAuthoredMsgSequence ? "assistant" : msg.author.name,
            content: cleanContent
        };
        convoMsgs.push(outMsg);

        if (++counter % 1000 === 0) {
            console.log(counter + " messages processed");
        }
    });

    stream.on('close', async () => {
        // flush the final, possibly partial conversation
        if (convoMsgs.length) {
            fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
        }
        console.log("Done!");
    });
}

main();