JSONL generation script from DiscordChatExporter
This commit is contained in:
parent
b9bdf1b86e
commit
f0c43cb702
1
data/.gitignore
vendored
Normal file
1
data/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
node_modules/
|
40
data/package-lock.json
generated
Normal file
40
data/package-lock.json
generated
Normal file
@ -0,0 +1,40 @@
|
||||
{
|
||||
"name": "discord",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"dependencies": {
|
||||
"JSONStream": "^1.3.5"
|
||||
}
|
||||
},
|
||||
"node_modules/jsonparse": {
|
||||
"version": "1.3.1",
|
||||
"resolved": "https://registry.npmjs.org/jsonparse/-/jsonparse-1.3.1.tgz",
|
||||
"integrity": "sha512-POQXvpdL69+CluYsillJ7SUhKvytYjW9vG/GKpnf+xP8UWgYEM/RaMzHHofbALDiKbbP1W8UEYmgGl39WkPZsg==",
|
||||
"engines": [
|
||||
"node >= 0.2.0"
|
||||
]
|
||||
},
|
||||
"node_modules/JSONStream": {
|
||||
"version": "1.3.5",
|
||||
"resolved": "https://registry.npmjs.org/JSONStream/-/JSONStream-1.3.5.tgz",
|
||||
"integrity": "sha512-E+iruNOY8VV9s4JEbe1aNEm6MiszPRr/UfcHMz0TQh1BXSxHK+ASV1R6W4HpjBhSeS+54PIsAMCBmwD06LLsqQ==",
|
||||
"dependencies": {
|
||||
"jsonparse": "^1.2.0",
|
||||
"through": ">=2.2.7 <3"
|
||||
},
|
||||
"bin": {
|
||||
"JSONStream": "bin.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/through": {
|
||||
"version": "2.3.8",
|
||||
"resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz",
|
||||
"integrity": "sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg=="
|
||||
}
|
||||
}
|
||||
}
|
5
data/package.json
Normal file
5
data/package.json
Normal file
@ -0,0 +1,5 @@
|
||||
{
|
||||
"dependencies": {
|
||||
"JSONStream": "^1.3.5"
|
||||
}
|
||||
}
|
83
data/process.js
Normal file
83
data/process.js
Normal file
@ -0,0 +1,83 @@
|
||||
const fs = require('node:fs');
const JSONStream = require('JSONStream');

// 1/MIKU_FREQ (= 20%) of message chains are randomly chosen to be from Miku.
const MIKU_FREQ = 5;

/**
 * Converts a DiscordChatExporter JSON dump (`input.json`) into JSONL
 * training data (`output.json`).
 *
 * The exporter output is streamed (via JSONStream) so arbitrarily large
 * dumps fit in memory. Messages are grouped into "sequences" (same author,
 * < 7 minutes apart); every 10 sequences form one "conversation", written
 * as a single JSON array per line (chatML-style role/content records).
 *
 * Side effects: deletes any pre-existing `output.json`, then appends to it.
 *
 * @returns {Promise<void>} resolves once stream handlers are installed
 *   (the stream itself keeps the process alive until it closes).
 */
async function main() {
    let counter = 0;

    // Start from a clean slate so reruns don't append to stale output.
    // BUG FIX: the original called `fsProm.unlink(...)` but `fsProm` was
    // never defined; the ReferenceError was swallowed by the bare catch,
    // so the old output.json silently survived. Use fs.promises instead.
    try {
        await fs.promises.unlink('output.json');
    } catch {
        // File didn't exist (or couldn't be removed) — best-effort cleanup.
    }

    const stream = fs.createReadStream('input.json').pipe(JSONStream.parse('messages.*'));

    // Sequence/conversation tracking state, shared across 'data' events.
    let lastMsgAuthor;          // author id of the previous message
    let lastMsgTime;            // Date of the previous message
    let botAuthoredMsgSequence; // is the current sequence attributed to Miku?
    let convoMsgSeqCount = 0;   // sequences accumulated in the current conversation
    let convoMsgs = [];         // chatML records of the current conversation

    stream.on('data', (msg) => {
        // no bot/system messages
        if (msg.author.isBot || (msg.type !== "Default" && msg.type !== "Reply")) {
            return;
        }

        /**
         * Replies are a tricky case. I considered pasting their context in, except this
         * requires keeping a full message cache and won't scale. Another possibility is
         * to maintain a small sliding window of message history and delete replies which
         * reference a message too far in the past... but what if that reply gets replied
         * to right after? Our chat in particular has a lot of these "necro" messages, but
         * since they tend to spark further discussion if anything, they probably don't
         * noticeably obfuscate the flow of conversation compared to normal time skips,
         * which our model is incapable of picking up in the first place.
         * TLDR: Keep the replies. Making too many assumptions is bad.
         */

        // scrub links; skip messages that were nothing but links
        const cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '');
        if (!cleanContent) {
            return;
        }

        // A new sequence starts when the author changes or >= 7 minutes passed.
        const msgTime = new Date(msg.timestamp);
        if (lastMsgAuthor !== msg.author.id || (msgTime - lastMsgTime) / 60000 >= 7) {
            lastMsgAuthor = msg.author.id;

            // bot will pretend to author a random subset of msg sequences
            botAuthoredMsgSequence = Math.floor(Math.random() * MIKU_FREQ) === 0;

            ++convoMsgSeqCount;
        }
        lastMsgTime = msgTime;

        // 10 msg sequences per "conversation": flush one JSONL line.
        if (convoMsgSeqCount === 10) {
            fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
            convoMsgSeqCount = 0;
            convoMsgs = [];
        }

        // follow chatML chat template
        convoMsgs.push({
            role: botAuthoredMsgSequence ? "assistant" : "user",
            content: cleanContent
        });

        if (++counter % 1000 === 0) {
            console.log(counter + " messages written");
        }
    });

    stream.on('close', () => {
        // Flush the final (possibly partial) conversation.
        if (convoMsgs.length) {
            fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
        }
        console.log("Done!");
    });
}

// Surface any unexpected rejection instead of leaving the promise floating.
main().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});
Loading…
x
Reference in New Issue
Block a user