JSONL generation script from DiscordChatExporter
commit f0c43cb702
parent b9bdf1b86e

data/.gitignore  (vendored, new file, 1 line)
@@ -0,0 +1 @@
node_modules/
data/package-lock.json  (generated, new file, 40 lines)
@@ -0,0 +1,40 @@
{
  "name": "discord",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "dependencies": {
        "JSONStream": "^1.3.5"
      }
    },
    "node_modules/jsonparse": {
      "version": "1.3.1",
      "resolved": "https://registry.npmjs.org/jsonparse/-/jsonparse-1.3.1.tgz",
      "integrity": "sha512-POQXvpdL69+CluYsillJ7SUhKvytYjW9vG/GKpnf+xP8UWgYEM/RaMzHHofbALDiKbbP1W8UEYmgGl39WkPZsg==",
      "engines": [
        "node >= 0.2.0"
      ]
    },
    "node_modules/JSONStream": {
      "version": "1.3.5",
      "resolved": "https://registry.npmjs.org/JSONStream/-/JSONStream-1.3.5.tgz",
      "integrity": "sha512-E+iruNOY8VV9s4JEbe1aNEm6MiszPRr/UfcHMz0TQh1BXSxHK+ASV1R6W4HpjBhSeS+54PIsAMCBmwD06LLsqQ==",
      "dependencies": {
        "jsonparse": "^1.2.0",
        "through": ">=2.2.7 <3"
      },
      "bin": {
        "JSONStream": "bin.js"
      },
      "engines": {
        "node": "*"
      }
    },
    "node_modules/through": {
      "version": "2.3.8",
      "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz",
      "integrity": "sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg=="
    }
  }
}
data/package.json  (new file, 5 lines)
@@ -0,0 +1,5 @@
{
  "dependencies": {
    "JSONStream": "^1.3.5"
  }
}
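Before the script itself, a hedged sketch of the input it expects: process.js (below) streams a DiscordChatExporter JSON export saved as input.json, an object with a top-level messages array, and it only reads the fields shown here. The sample values are invented for illustration and are not part of this commit.

    // Hypothetical shape of input.json — only the fields process.js actually reads.
    const exampleExport = {
        messages: [
            {
                type: "Default",                                 // only "Default" and "Reply" messages are kept
                timestamp: "2023-05-01T18:24:36.789+00:00",      // parsed via new Date(msg.timestamp)
                content: "check this out https://example.com",   // URLs are scrubbed before use
                author: { id: "123456789012345678", isBot: false }
            }
        ]
    };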
							
								
								
									
data/process.js  (new file, 83 lines)
@@ -0,0 +1,83 @@
const fs = require('node:fs');
const JSONStream = require('JSONStream');

const MIKU_FREQ = 5;   // 1/5 = 20% of message chains are randomly chosen to be from Miku

async function main() {
    let counter = 0;
    try {
        await fs.promises.unlink('output.json');   // start fresh: drop any previous output
    } catch {}

    const stream = fs.createReadStream('input.json').pipe(JSONStream.parse('messages.*'));
    let lastMsgAuthor;
    let lastMsgTime;
    let botAuthoredMsgSequence;
    let convoMsgSeqCount = 0;
    let convoMsgs = [];

    stream.on('data', (msg) => {
        // no bot/system messages
        if (msg.author.isBot || (msg.type !== "Default" && msg.type !== "Reply")) {
            return;
        }

        /**
         * Replies are a tricky case. I considered pasting their context in, except this
         * requires keeping a full message cache and won't scale. Another possibility is
         * to maintain a small sliding window of message history and delete replies which
         * reference a message too far in the past... but what if that reply gets replied
         * to right after? Our chat in particular has a lot of these "necro" messages, but
         * since they tend to spark further discussion if anything, they probably don't
         * noticeably obfuscate the flow of conversation compared to normal time skips,
         * which our model is incapable of picking up in the first place.
         * TLDR: Keep the replies. Making too many assumptions is bad.
         */

        // scrub links
        const cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '');
        if (!cleanContent) {
            return;
        }

        // a new message sequence starts on author change or a gap of 7+ minutes
        const msgTime = new Date(msg.timestamp);
        if (lastMsgAuthor !== msg.author.id || (msgTime - lastMsgTime) / 60000 >= 7) {
            lastMsgAuthor = msg.author.id;

            // bot will pretend to author roughly 1 in MIKU_FREQ message sequences
            botAuthoredMsgSequence = Math.floor(Math.random() * MIKU_FREQ) === 0;

            ++convoMsgSeqCount;
        }
        lastMsgTime = msgTime;

        // 10 msg sequences per "conversation"
        if (convoMsgSeqCount === 10) {
            // write JSONL format: one conversation per line
            fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
            convoMsgSeqCount = 0;
            convoMsgs = [];
        }

        // follow the ChatML chat template
        const outMsg = {
            role: botAuthoredMsgSequence ? "assistant" : "user",
            content: cleanContent
        };
        convoMsgs.push(outMsg);

        if (++counter % 1000 === 0) {
            console.log(counter + " messages processed");
        }
    });

    stream.on('close', () => {
        if (convoMsgs.length) {   // flush the final, possibly partial conversation
            fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
        }
        console.log("Done!");
    });
}

main();
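With the dependency installed (npm install in data/) and the export saved as input.json in the working directory, node process.js appends one conversation per line to output.json. A hedged sketch of what a single output line represents, pretty-printed here for readability and with invented content strings (the script writes each conversation as one line):

    // One line of output.json = one "conversation": an array of ChatML-style messages
    // grouped from roughly 10 consecutive message sequences, with about 1 in MIKU_FREQ
    // sequences relabeled as the assistant.
    [
        { "role": "user", "content": "anyone tried the new streaming parser?" },
        { "role": "assistant", "content": "yeah, it handles the big export fine" }
    ]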