From f0c43cb7024caf6f96de01f4df3c29458ea841f1 Mon Sep 17 00:00:00 2001
From: James Shiffer <jshiffer@ucla.edu>
Date: Sat, 6 Apr 2024 01:38:46 -0700
Subject: [PATCH] JSONL generation script from DiscordChatExporter

---
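Usage sketch (assumes a DiscordChatExporter JSON export saved as
data/input.json, with Node.js and npm available):

    cd data
    npm install
    node process.js

Each line of the resulting output.json holds one "conversation": a JSON
array of ChatML-style {role, content} messages.
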
 data/.gitignore        |  1 +
 data/package-lock.json | 40 ++++++++++++++++++++
 data/package.json      |  5 +++
 data/process.js        | 88 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 134 insertions(+)
 create mode 100644 data/.gitignore
 create mode 100644 data/package-lock.json
 create mode 100644 data/package.json
 create mode 100644 data/process.js

diff --git a/data/.gitignore b/data/.gitignore
new file mode 100644
index 0000000..40b878d
--- /dev/null
+++ b/data/.gitignore
@@ -0,0 +1 @@
+node_modules/
\ No newline at end of file
diff --git a/data/package-lock.json b/data/package-lock.json
new file mode 100644
index 0000000..0e9f8e4
--- /dev/null
+++ b/data/package-lock.json
@@ -0,0 +1,40 @@
+{
+  "name": "discord",
+  "lockfileVersion": 3,
+  "requires": true,
+  "packages": {
+    "": {
+      "dependencies": {
+        "JSONStream": "^1.3.5"
+      }
+    },
+    "node_modules/jsonparse": {
+      "version": "1.3.1",
+      "resolved": "https://registry.npmjs.org/jsonparse/-/jsonparse-1.3.1.tgz",
+      "integrity": "sha512-POQXvpdL69+CluYsillJ7SUhKvytYjW9vG/GKpnf+xP8UWgYEM/RaMzHHofbALDiKbbP1W8UEYmgGl39WkPZsg==",
+      "engines": [
+        "node >= 0.2.0"
+      ]
+    },
+    "node_modules/JSONStream": {
+      "version": "1.3.5",
+      "resolved": "https://registry.npmjs.org/JSONStream/-/JSONStream-1.3.5.tgz",
+      "integrity": "sha512-E+iruNOY8VV9s4JEbe1aNEm6MiszPRr/UfcHMz0TQh1BXSxHK+ASV1R6W4HpjBhSeS+54PIsAMCBmwD06LLsqQ==",
+      "dependencies": {
+        "jsonparse": "^1.2.0",
+        "through": ">=2.2.7 <3"
+      },
+      "bin": {
+        "JSONStream": "bin.js"
+      },
+      "engines": {
+        "node": "*"
+      }
+    },
+    "node_modules/through": {
+      "version": "2.3.8",
+      "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz",
+      "integrity": "sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg=="
+    }
+  }
+}
diff --git a/data/package.json b/data/package.json
new file mode 100644
index 0000000..8b56545
--- /dev/null
+++ b/data/package.json
@@ -0,0 +1,5 @@
+{
+  "dependencies": {
+    "JSONStream": "^1.3.5"
+  }
+}
diff --git a/data/process.js b/data/process.js
new file mode 100644
index 0000000..1fa36b8
--- /dev/null
+++ b/data/process.js
@@ -0,0 +1,88 @@
+const fs = require('node:fs');
+const fsProm = require('node:fs/promises');
+const JSONStream = require('JSONStream');
+
+const MIKU_FREQ = 5;   // 1/MIKU_FREQ = 20% of message sequences are randomly attributed to Miku
+
+async function main() {
+    let counter = 0;
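+    // delete any output left over from a previous run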
+    try {
+        await fsProm.unlink('output.json');
+    } catch {}
+
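+    // stream each element of the export's "messages" array without loading the whole file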
+    const stream = fs.createReadStream('input.json').pipe(JSONStream.parse('messages.*'));
+    let lastMsgAuthor;
+    let lastMsgTime;
+    let botAuthoredMsgSequence;
+    let convoMsgSeqCount = 0;
+    let convoMsgs = [];
+
+    stream.on('data', (msg) => {
+        // no bot/system messages
+        if (msg.author.isBot || (msg.type !== "Default" && msg.type !== "Reply")) {
+            return;
+        }
+
+        /**
+         * Replies are a tricky case. I considered pasting their context in, except this
+         * requires keeping a full message cache and won't scale. Another possibility is
+         * to maintain a small sliding window of message history and delete replies which
+         * reference a message too far in the past... but what if that reply gets replied
+         * to right after? Our chat in particular has a lot of these "necro" messages, but
+         * since they tend to spark further discussion if anything, they probably don't
+         * noticeably obfuscate the flow of conversation compared to normal time skips,
+         * which our model is incapable of picking up in the first place.
+         * TLDR: Keep the replies. Making too many assumptions is bad.
+         */
+
+        // scrub links
+        const cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '');
+        if (!cleanContent) {
+            return;
+        }
+
+        // a message starts a new sequence when the author changes or when
+        // 7+ minutes have elapsed since the previous message
+        let msgTime = new Date(msg.timestamp);
+        if (lastMsgAuthor !== msg.author.id || (msgTime - lastMsgTime)/60000 >= 7) {
+            lastMsgAuthor = msg.author.id;
+
+            // with probability 1/MIKU_FREQ, attribute this whole sequence to the bot
+            botAuthoredMsgSequence = Math.floor(Math.random() * MIKU_FREQ) === 0;
+
+            ++convoMsgSeqCount;
+        }
+        lastMsgTime = msgTime;
+
+        // 10 msg sequences per "conversation"
+        if (convoMsgSeqCount === 10) {
+            // write JSONL format
+            fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
+            convoMsgSeqCount = 0;
+            convoMsgs = [];
+        }
+
+        // follow the ChatML chat template
+        const outMsg = {
+            role: botAuthoredMsgSequence ? "assistant" : "user",
+            content: cleanContent
+        };
+        convoMsgs.push(outMsg);
+
+        if (++counter % 1000 === 0) {
+            console.log(counter + " messages written");
+        }
+    });
+
+    stream.on('close', () => {
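+        // flush any trailing partial conversation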
+        if (convoMsgs.length) {
+            fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
+        }
+        console.log("Done!");
+    });
+}
+
+main();