diff --git a/data/.gitignore b/data/.gitignore
new file mode 100644
index 0000000..40b878d
--- /dev/null
+++ b/data/.gitignore
@@ -0,0 +1 @@
+node_modules/
\ No newline at end of file
diff --git a/data/package-lock.json b/data/package-lock.json
new file mode 100644
index 0000000..0e9f8e4
--- /dev/null
+++ b/data/package-lock.json
@@ -0,0 +1,40 @@
+{
+  "name": "discord",
+  "lockfileVersion": 3,
+  "requires": true,
+  "packages": {
+    "": {
+      "dependencies": {
+        "JSONStream": "^1.3.5"
+      }
+    },
+    "node_modules/jsonparse": {
+      "version": "1.3.1",
+      "resolved": "https://registry.npmjs.org/jsonparse/-/jsonparse-1.3.1.tgz",
+      "integrity": "sha512-POQXvpdL69+CluYsillJ7SUhKvytYjW9vG/GKpnf+xP8UWgYEM/RaMzHHofbALDiKbbP1W8UEYmgGl39WkPZsg==",
+      "engines": [
+        "node >= 0.2.0"
+      ]
+    },
+    "node_modules/JSONStream": {
+      "version": "1.3.5",
+      "resolved": "https://registry.npmjs.org/JSONStream/-/JSONStream-1.3.5.tgz",
+      "integrity": "sha512-E+iruNOY8VV9s4JEbe1aNEm6MiszPRr/UfcHMz0TQh1BXSxHK+ASV1R6W4HpjBhSeS+54PIsAMCBmwD06LLsqQ==",
+      "dependencies": {
+        "jsonparse": "^1.2.0",
+        "through": ">=2.2.7 <3"
+      },
+      "bin": {
+        "JSONStream": "bin.js"
+      },
+      "engines": {
+        "node": "*"
+      }
+    },
+    "node_modules/through": {
+      "version": "2.3.8",
+      "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz",
+      "integrity": "sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg=="
+    }
+  }
+}
diff --git a/data/package.json b/data/package.json
new file mode 100644
index 0000000..8b56545
--- /dev/null
+++ b/data/package.json
@@ -0,0 +1,5 @@
+{
+  "dependencies": {
+    "JSONStream": "^1.3.5"
+  }
+}
diff --git a/data/process.js b/data/process.js
new file mode 100644
index 0000000..1fa36b8
--- /dev/null
+++ b/data/process.js
@@ -0,0 +1,83 @@
+const fs = require('node:fs');
+const JSONStream = require('JSONStream');
+
+const MIKU_FREQ = 5; // 1/5 = 20% of message chains are randomly chosen to be from Miku
+
+async function main() {
+  let counter = 0;
+  try {
+    await fs.promises.unlink('output.json'); // remove stale output from a previous run
+  } catch {} // ENOENT on first run — safe to ignore
+
+  const stream = fs.createReadStream('input.json').pipe(JSONStream.parse('messages.*'));
+  let lastMsgAuthor;
+  let lastMsgTime;
+  let botAuthoredMsgSequence;
+  let convoMsgSeqCount = 0;
+  let convoMsgs = [];
+
+  stream.on('data', async (msg) => {
+    // no bot/system messages
+    if (msg.author.isBot || (msg.type !== "Default" && msg.type !== "Reply")) {
+      return;
+    }
+
+    /**
+     * Replies are a tricky case. I considered pasting their context in, except this
+     * requires keeping a full message cache and won't scale. Another possibility is
+     * to maintain a small sliding window of message history and delete replies which
+     * reference a message too far in the past... but what if that reply gets replied
+     * to right after? Our chat in particular has a lot of these "necro" messages, but
+     * since they tend to spark further discussion if anything, they probably don't
+     * noticeably obfuscate the flow of conversation compared to normal time skips,
+     * which our model is incapable of picking up in the first place.
+     * TLDR: Keep the replies. Making too many assumptions is bad.
+     */
+
+    // scrub links
+    const cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '');
+    if (!cleanContent) {
+      return;
+    }
+
+    // determine continuity of message sequences
+    let msgTime = new Date(msg.timestamp);
+    if (lastMsgAuthor !== msg.author.id || (msgTime - lastMsgTime)/60000 >= 7) {
+      lastMsgAuthor = msg.author.id;
+
+      // bot will pretend to author a random number of msg sequences
+      botAuthoredMsgSequence = Math.floor(Math.random() * MIKU_FREQ) === 0;
+
+      ++convoMsgSeqCount;
+    }
+    lastMsgTime = msgTime;
+
+    // 10 msg sequences per "conversation"
+    if (convoMsgSeqCount === 10) {
+      // write JSONL format
+      fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
+      convoMsgSeqCount = 0;
+      convoMsgs = [];
+    }
+
+    // follow chatML chat template
+    const outMsg = {
+      role: botAuthoredMsgSequence ? "assistant" : "user",
+      content: cleanContent
+    };
+    convoMsgs.push(outMsg);
+
+    if (++counter % 1000 === 0) {
+      console.log(counter + " messages written");
+    }
+  });
+
+  stream.on('close', async () => {
+    if (convoMsgs.length) {
+      fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
+    }
+    console.log("Done!");
+  });
+}
+
+main();