JSONL generation script from DiscordChatExporter

James Shiffer 2024-04-06 01:38:46 -07:00
parent b9bdf1b86e
commit f0c43cb702
4 changed files with 129 additions and 0 deletions

data/.gitignore vendored Normal file

@@ -0,0 +1 @@
node_modules/

data/package-lock.json generated Normal file

@@ -0,0 +1,40 @@
{
  "name": "discord",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "dependencies": {
        "JSONStream": "^1.3.5"
      }
    },
    "node_modules/jsonparse": {
      "version": "1.3.1",
      "resolved": "https://registry.npmjs.org/jsonparse/-/jsonparse-1.3.1.tgz",
      "integrity": "sha512-POQXvpdL69+CluYsillJ7SUhKvytYjW9vG/GKpnf+xP8UWgYEM/RaMzHHofbALDiKbbP1W8UEYmgGl39WkPZsg==",
      "engines": [
        "node >= 0.2.0"
      ]
    },
    "node_modules/JSONStream": {
      "version": "1.3.5",
      "resolved": "https://registry.npmjs.org/JSONStream/-/JSONStream-1.3.5.tgz",
      "integrity": "sha512-E+iruNOY8VV9s4JEbe1aNEm6MiszPRr/UfcHMz0TQh1BXSxHK+ASV1R6W4HpjBhSeS+54PIsAMCBmwD06LLsqQ==",
      "dependencies": {
        "jsonparse": "^1.2.0",
        "through": ">=2.2.7 <3"
      },
      "bin": {
        "JSONStream": "bin.js"
      },
      "engines": {
        "node": "*"
      }
    },
    "node_modules/through": {
      "version": "2.3.8",
      "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz",
      "integrity": "sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg=="
    }
  }
}

data/package.json Normal file

@@ -0,0 +1,5 @@
{
  "dependencies": {
    "JSONStream": "^1.3.5"
  }
}

data/process.js Normal file

@@ -0,0 +1,83 @@
const fs = require('node:fs');
const fsProm = require('node:fs/promises');
const JSONStream = require('JSONStream');

const MIKU_FREQ = 5; // 1/5 = 20% of message chains are randomly chosen to be from Miku

async function main() {
    let counter = 0;
    try {
        await fsProm.unlink('output.json');
    } catch {}

    const stream = fs.createReadStream('input.json').pipe(JSONStream.parse('messages.*'));
    let lastMsgAuthor;
    let lastMsgTime;
    let botAuthoredMsgSequence;
    let convoMsgSeqCount = 0;
    let convoMsgs = [];
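
    // Each element of the export's top-level "messages" array is parsed and
    // emitted one at a time, so the whole file never has to fit in memory;
    // consecutive messages from one author form "sequences", and every 10
    // sequences become one conversation.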
    stream.on('data', async (msg) => {
        // no bot/system messages
        if (msg.author.isBot || (msg.type !== "Default" && msg.type !== "Reply")) {
            return;
        }
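        // (DiscordChatExporter labels ordinary messages "Default" and replies
        // "Reply"; other types, e.g. pins, joins and calls, are system events.)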
        /**
         * Replies are a tricky case. I considered pasting their context in, except this
         * requires keeping a full message cache and won't scale. Another possibility is
         * to maintain a small sliding window of message history and delete replies which
         * reference a message too far in the past... but what if that reply gets replied
         * to right after? Our chat in particular has a lot of these "necro" messages, but
         * since they tend to spark further discussion if anything, they probably don't
         * noticeably obfuscate the flow of conversation compared to normal time skips,
         * which our model is incapable of picking up in the first place.
         * TLDR: Keep the replies. Making too many assumptions is bad.
         */
        // scrub links
        const cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '');
        if (!cleanContent) {
            return;
        }
        // determine continuity of message sequences: a new sequence starts when
        // the author changes or 7+ minutes have passed since the last message
        let msgTime = new Date(msg.timestamp);
        if (lastMsgAuthor !== msg.author.id || (msgTime - lastMsgTime)/60000 >= 7) {
            lastMsgAuthor = msg.author.id;
            // each new sequence has a 1-in-MIKU_FREQ chance of being attributed to the bot
            botAuthoredMsgSequence = Math.floor(Math.random() * MIKU_FREQ) === 0;
            ++convoMsgSeqCount;
        }
        lastMsgTime = msgTime;
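
        // Illustrative shape of one flushed line (made-up data):
        // [{"role":"user","content":"hi"},{"role":"assistant","content":"o/"}]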
        // 10 msg sequences per "conversation"
        if (convoMsgSeqCount === 10) {
            // write JSONL format
            fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
            convoMsgSeqCount = 0;
            convoMsgs = [];
        }
        // follow the ChatML chat template
        const outMsg = {
            role: botAuthoredMsgSequence ? "assistant" : "user",
            content: cleanContent
        };
        convoMsgs.push(outMsg);

        if (++counter % 1000 === 0) {
            console.log(counter + " messages written");
        }
    });

    stream.on('close', async () => {
        // flush any partial conversation left in the buffer
        if (convoMsgs.length) {
            fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
        }
        console.log("Done!");
    });
}

main();
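
Usage sketch (assuming the DiscordChatExporter export is saved as data/input.json): run npm install and then node process.js inside data/. Each line of output.json is one JSON array holding a single ChatML-style conversation, i.e. JSON Lines despite the .json extension.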