Merge branch 'main' of git.femboyfinancial.jp:james/MikuAI
This commit is contained in:
commit
b73b88de4d
22
data/procToxicQA.js
Normal file
22
data/procToxicQA.js
Normal file
@ -0,0 +1,22 @@
|
||||
const fs = require('node:fs');

// Converts toxicQA.json into toxicQAfinal.json, line by line.
//   input  line: a JSON object with a "conversations" array of { from, value } entries
//   output line: a JSON array of { role, content } entries
// An entry whose "from" is "human" gets role "user"; anything else gets "assistant".
const lineReader = require('readline').createInterface({
    input: fs.createReadStream('toxicQA.json')
});

// The default 'w' flag creates the output file, or truncates it if it already
// exists, so no explicit delete is needed. A synchronous unlink right after
// this call would run before the stream has created the file and throw ENOENT
// on a fresh run.
const outstream = fs.createWriteStream('toxicQAfinal.json');

lineReader.on('line', (line) => {
    // Blank lines are not valid JSON; skip them instead of letting JSON.parse throw.
    if (line.trim() === '') {
        return;
    }

    const dialogue = JSON.parse(line)["conversations"];
    const newdialogue = dialogue.map((dialogueLine) => ({
        role: dialogueLine["from"] === "human" ? "user" : "assistant",
        content: dialogueLine["value"]
    }));
    outstream.write(`${JSON.stringify(newdialogue)}\n`);
});

lineReader.on('close', () => {
    // End the output stream and report completion only once every queued
    // write has been flushed.
    outstream.end(() => {
        console.log('all done, son');
    });
});
|
@ -1,7 +1,24 @@
|
||||
const fs = require('node:fs');
|
||||
const JSONStream = require('JSONStream');
|
||||
|
||||
const MIKU_FREQ = 5; // 1/5 = 20% of message chains are randomly chosen to be from Miku
|
||||
const MIKU_FREQ = 5; // 1/5 = 20% of message chains are randomly chosen to be from Miku
|
||||
const DROPOUT_UNFUNNY = 0.75; // 75% dropout rate for messages chains which have NO reactions
|
||||
const USERNAMES = [
|
||||
'vinny volcano\uD83C\uDF0B (伊胜焱)',
|
||||
'Server Comp!',
|
||||
'Make The Map \uD83D\uDDFA',
|
||||
'1981 Celical Man\uD83C\uDF41\uD83C\uDF42',
|
||||
'Hatsune Miku',
|
||||
'Cutie Kazerounian\uD83C\uDF41\uD83C\uDF42',
|
||||
'David Pan (Fembooru)\uD83C\uDF41\uD83C\uDF42',
|
||||
'Exiled Sammy \uD83D\uDD12\uD83C\uDFDD⏱',
|
||||
'shibe.mp4❄☃',
|
||||
'Today Man-San(1990)\uD83C\uDF41\uD83C\uDF42',
|
||||
'owner',
|
||||
'cj7 by stephen chow (gmod PC)\uD83C\uDF41\uD83C\uDF42',
|
||||
'Nicolaid',
|
||||
'epbic',
|
||||
];
|
||||
|
||||
async function main() {
|
||||
let counter = 0;
|
||||
@ -14,6 +31,7 @@ async function main() {
|
||||
let lastMsgTime;
|
||||
let botAuthoredMsgSequence;
|
||||
let convoMsgSeqCount = 0;
|
||||
let convoReactCount = 0;
|
||||
let convoMsgs = [];
|
||||
|
||||
stream.on('data', async (msg) => {
|
||||
@ -35,11 +53,18 @@ async function main() {
|
||||
*/
|
||||
|
||||
// scrub links
|
||||
const cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '');
|
||||
let cleanContent = msg.content.replaceAll(/https?:\/\/\S+/gi, '');
|
||||
// scrub @mentions
|
||||
for (const username of USERNAMES) {
|
||||
cleanContent = cleanContent.replaceAll(`@${username}`, "");
|
||||
}
|
||||
if (!cleanContent) {
|
||||
return;
|
||||
}
|
||||
|
||||
// count reaction
|
||||
convoReactCount += msg.reactions.length;
|
||||
|
||||
// determine continuity of message sequences
|
||||
let msgTime = new Date(msg.timestamp);
|
||||
if (lastMsgAuthor !== msg.author.id || (msgTime - lastMsgTime)/60000 >= 7) {
|
||||
@ -54,15 +79,19 @@ async function main() {
|
||||
|
||||
// 10 msg sequences per "conversation"
|
||||
if (convoMsgSeqCount === 10) {
|
||||
// write JSONL format
|
||||
fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
|
||||
convoMsgSeqCount = 0;
|
||||
// dropout
|
||||
const convoKeep = convoReactCount > 0 || Math.random() >= DROPOUT_UNFUNNY;
|
||||
if (convoKeep) {
|
||||
// write JSONL format
|
||||
fs.appendFileSync('output.json', JSON.stringify(convoMsgs) + '\n');
|
||||
}
|
||||
convoMsgSeqCount = convoReactCount = 0;
|
||||
convoMsgs = [];
|
||||
}
|
||||
|
||||
// follow chatML chat template
|
||||
const outMsg = {
|
||||
role: botAuthoredMsgSequence ? "assistant" : "user",
|
||||
role: botAuthoredMsgSequence ? "assistant" : msg.author.name,
|
||||
content: cleanContent
|
||||
};
|
||||
convoMsgs.push(outMsg);
|
||||
|
10070
train_unsloth.ipynb
10070
train_unsloth.ipynb
File diff suppressed because one or more lines are too long
Loading…
x
Reference in New Issue
Block a user