Merge branch 'main' of git.femboyfinancial.jp:james/MikuAI

This commit is contained in:
James S 2024-06-01 16:28:51 -07:00
commit b73b88de4d
3 changed files with 58 additions and 10075 deletions

22
data/procToxicQA.js Normal file
View File

@ -0,0 +1,22 @@
const fs = require('node:fs');
// Converts toxicQA.json (one JSON object per line) into toxicQAfinal.json
// (one JSON array per line). Each input object's "conversations" entries,
// which carry "from" and "value" fields, become { role, content } messages.
const lineReader = require('readline').createInterface({
    input: fs.createReadStream('toxicQA.json')
});
// The default 'w' flag creates the file or truncates an existing one, so no
// explicit unlink is needed. Unlinking here used to throw ENOENT on a fresh
// run, because the stream opens the file asynchronously and it does not
// exist yet when the synchronous unlink executes.
const outstream = fs.createWriteStream('toxicQAfinal.json');
lineReader.on('line', function (line) {
    // Skip blank lines instead of letting JSON.parse throw on them.
    if (line.trim() === '') {
        return;
    }
    const dialogue = JSON.parse(line)["conversations"];
    // "human" turns map to the "user" role; every other sender is "assistant".
    const newdialogue = dialogue.map(function (dialogueLine) {
        return {
            role: dialogueLine["from"] === "human" ? "user" : "assistant",
            content: dialogueLine["value"]
        };
    });
    outstream.write(JSON.stringify(newdialogue) + '\n');
});
lineReader.on('close', function () {
    // End the output stream and report completion only once pending writes
    // have been flushed.
    outstream.end(function () {
        console.log('all done, son');
    });
});

View File

@ -1,7 +1,24 @@
const fs = require('node:fs'); const fs = require('node:fs');
const JSONStream = require('JSONStream'); const JSONStream = require('JSONStream');
const MIKU_FREQ = 5; // 1/5 = 20% of message chains are randomly chosen to be from Miku const MIKU_FREQ = 5; // 1/5 = 20% of message chains are randomly chosen to be from Miku
const DROPOUT_UNFUNNY = 0.75; // 75% dropout rate for message chains which have NO reactions
const USERNAMES = [
'vinny volcano\uD83C\uDF0B (伊胜焱)',
'Server Comp!',
'Make The Map \uD83D\uDDFA',
'1981 Celical Man\uD83C\uDF41\uD83C\uDF42',
'Hatsune Miku',
'Cutie Kazerounian\uD83C\uDF41\uD83C\uDF42',
'David Pan (Fembooru)\uD83C\uDF41\uD83C\uDF42',
'Exiled Sammy \uD83D\uDD12\uD83C\uDFDD⏱',
'shibe.mp4❄☃',
'Today Man-San(1990)\uD83C\uDF41\uD83C\uDF42',
'owner',
'cj7 by stephen chow (gmod PC)\uD83C\uDF41\uD83C\uDF42',
'Nicolaid',
'epbic',
];
async function main() { async function main() {
let counter = 0; let counter = 0;
@ -14,6 +31,7 @@ async function main() {
let lastMsgTime; let lastMsgTime;
let botAuthoredMsgSequence; let botAuthoredMsgSequence;
let convoMsgSeqCount = 0; let convoMsgSeqCount = 0;
let convoReactCount = 0;
let convoMsgs = []; let convoMsgs = [];
stream.on('data', async (msg) => { stream.on('data', async (msg) => {
@ -35,11 +53,18 @@ async function main() {
*/ */
// scrub links // scrub links
const cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, ''); let cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '');
// scrub @mentions
for (const username of USERNAMES) {
cleanContent = cleanContent.replaceAll(`@${username}`, "");
}
if (!cleanContent) { if (!cleanContent) {
return; return;
} }
// count reaction
convoReactCount += msg.reactions.length;
// determine continuity of message sequences // determine continuity of message sequences
let msgTime = new Date(msg.timestamp); let msgTime = new Date(msg.timestamp);
if (lastMsgAuthor !== msg.author.id || (msgTime - lastMsgTime)/60000 >= 7) { if (lastMsgAuthor !== msg.author.id || (msgTime - lastMsgTime)/60000 >= 7) {
@ -54,15 +79,19 @@ async function main() {
// 10 msg sequences per "conversation" // 10 msg sequences per "conversation"
if (convoMsgSeqCount === 10) { if (convoMsgSeqCount === 10) {
// write JSONL format // dropout
fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n'); const convoKeep = convoReactCount > 0 || Math.random() >= DROPOUT_UNFUNNY;
convoMsgSeqCount = 0; if (convoKeep) {
// write JSONL format
fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
}
convoMsgSeqCount = convoReactCount = 0;
convoMsgs = []; convoMsgs = [];
} }
// follow chatML chat template // follow chatML chat template
const outMsg = { const outMsg = {
role: botAuthoredMsgSequence ? "assistant" : "user", role: botAuthoredMsgSequence ? "assistant" : msg.author.name,
content: cleanContent content: cleanContent
}; };
convoMsgs.push(outMsg); convoMsgs.push(outMsg);

File diff suppressed because one or more lines are too long