This commit is contained in:
James S 2024-05-05 01:16:35 -07:00
parent f840a7f321
commit c990740219
4 changed files with 58 additions and 44 deletions

21
api.py
View File

@ -1,7 +1,8 @@
from fastapi import FastAPI, File, Query, Response, UploadFile from fastapi import FastAPI, File, Query, Response, UploadFile
from fastapi.encoders import jsonable_encoder from fastapi.encoders import jsonable_encoder
from fastapi.responses import FileResponse from fastapi.responses import FileResponse
from config import TOKEN from config import EDGETTS_VOICE, TOKEN
import edge_tts
import hmac import hmac
import model import model
from pydantic import BaseModel from pydantic import BaseModel
@ -47,6 +48,24 @@ async def rvc(token: str,
return FileResponse(ai_vocals_path) return FileResponse(ai_vocals_path)
@app.post("/tts")
async def tts(token: str,
              text: str,
              response: Response,
              pitch_change_oct: Annotated[int, Query()] = 1,
              pitch_change_sem: Annotated[int, Query()] = 0):
    """Synthesize speech from `text` with edge-tts, then run it through the
    RVC song-cover pipeline using the 'miku' voice model, and return the
    converted audio file.

    Parameters:
        token: shared-secret auth token; compared against config.TOKEN.
        text: the text to synthesize.
        response: FastAPI response object, used to set the 401 status code.
        pitch_change_oct: pitch shift in octaves passed to the pipeline (default 1).
        pitch_change_sem: pitch shift in semitones passed to the pipeline (default 0).

    Returns:
        FileResponse with the converted audio on success, or a
        {"error": ...} dict with HTTP 401 on a bad token.
    """
    # hmac.compare_digest gives a constant-time comparison, avoiding a
    # timing side-channel on the shared token.
    if not hmac.compare_digest(token, TOKEN):
        response.status_code = 401
        return {"error": "Bad token"}
    with tempfile.NamedTemporaryFile() as tmp:
        # edge-tts writes the synthesized audio to tmp.name.
        # NOTE(review): tmp has no file extension — confirm that
        # song_cover_pipeline detects the audio format by content rather
        # than by suffix; if not, pass suffix=".mp3" to NamedTemporaryFile.
        communicate = edge_tts.Communicate(text, EDGETTS_VOICE)
        await communicate.save(tmp.name)
        # The pipeline consumes tmp.name while the tempfile still exists and
        # returns a path to a separate output file (not tmp), so serving it
        # after the tempfile is deleted is safe.
        ai_vocals_path = song_cover_pipeline(tmp.name, pitch_change_oct, voice_model='miku', pitch_change_sem=pitch_change_sem)
    return FileResponse(ai_vocals_path)
@app.get("/ping") @app.get("/ping")
def ping(): def ping():
return {"message": "pong"} return {"message": "pong"}

View File

@ -1 +1,2 @@
TOKEN = "4BJkM2NO3Y6wFDXuHdKc" TOKEN = "4BJkM2NO3Y6wFDXuHdKc"
EDGETTS_VOICE = "en-US-AriaNeural"

View File

@ -1,4 +1,5 @@
#deemix #deemix
edge-tts==6.1.11
fairseq==0.12.2 fairseq==0.12.2
faiss-cpu==1.7.3 faiss-cpu==1.7.3
fastapi==0.110.0 fastapi==0.110.0

View File

@ -2,16 +2,19 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 12,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stderr", "ename": "ImportError",
"output_type": "stream", "evalue": "attempted relative import with no known parent package",
"text": [ "output_type": "error",
"2024-03-31 05:59:31 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX\n", "traceback": [
"2024-03-31 05:59:32 | INFO | faiss.loader | Loading faiss with AVX2 support.\n", "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"2024-03-31 05:59:32 | INFO | faiss.loader | Successfully loaded faiss with AVX2 support.\n" "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[12], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mmain\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m song_cover_pipeline\n",
"File \u001b[0;32m~/MikuAI/rvc/main.py:9\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msox\u001b[39;00m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m#from pedalboard import Pedalboard, Reverb, Compressor, HighpassFilter\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;66;03m#from pedalboard.io import AudioFile\u001b[39;00m\n\u001b[0;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrvc\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Config, load_hubert, get_vc, rvc_infer\n\u001b[1;32m 11\u001b[0m BASE_DIR \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mdirname(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mabspath(\u001b[38;5;18m__file__\u001b[39m))\n\u001b[1;32m 13\u001b[0m rvc_models_dir \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(BASE_DIR, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrvc_models\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
"\u001b[0;31mImportError\u001b[0m: attempted relative import with no known parent package"
] ]
} }
], ],
@ -40,45 +43,35 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import edge_tts\n",
"voice = 'en-US-AriaNeural'\n",
"\n",
"async def tts():\n",
" communicate = edge_tts.Communicate('vinnybobinny', voice)\n",
" await communicate.save('../audio/input.ogg')\n",
"\n",
"await tts()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "ename": "NameError",
"output_type": "stream", "evalue": "name 'song_cover_pipeline' is not defined",
"text": [ "output_type": "error",
"Output file path will be: ../audio/input_miku_p12_i0.5_fr3_rms0.25_pro0.33_rmvpe.wav\n", "traceback": [
"[~] Converting voice using RVC...\n" "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43msong_cover_pipeline\u001b[49m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m../audio/input.ogg\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;241m1\u001b[39m)\n",
"\u001b[0;31mNameError\u001b[0m: name 'song_cover_pipeline' is not defined"
] ]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-03-31 05:59:45 | INFO | fairseq.tasks.hubert_pretraining | current directory is /home/jshiffer/rvc\n",
"2024-03-31 05:59:45 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': 'metadata', 'fine_tuning': False, 'labels': ['km'], 'label_dir': 'label', 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}\n",
"2024-03-31 05:59:45 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': gelu, 'layer_type': transformer, 'dropout': 0.1, 'attention_dropout': 0.1, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.05, 'dropout_input': 0.1, 'dropout_features': 0.1, 'final_dim': 256, 'untie_final_proj': True, 'layer_norm_first': False, 'conv_feature_layers': '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2', 'conv_bias': False, 'logit_temp': 0.1, 'target_glu': False, 'feature_grad_mult': 0.1, 'mask_length': 10, 'mask_prob': 0.8, 'mask_selection': static, 'mask_other': 0.0, 'no_mask_overlap': False, 'mask_min_space': 1, 'mask_channel_length': 10, 'mask_channel_prob': 0.0, 'mask_channel_selection': static, 'mask_channel_other': 0.0, 'no_mask_channel_overlap': False, 'mask_channel_min_space': 1, 'conv_pos': 128, 'conv_pos_groups': 16, 'latent_temp': [2.0, 0.5, 0.999995], 'skip_masked': False, 'skip_nomask': False, 'checkpoint_activations': False, 'required_seq_len_multiple': 2, 'depthwise_conv_kernel_size': 31, 'attn_type': '', 'pos_enc_type': 'abs', 'fp16': False}\n",
"/opt/conda/envs/rvc/lib/python3.10/site-packages/torch/nn/utils/weight_norm.py:28: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n",
" warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"gin_channels: 256 self.spk_embed_dim: 109\n",
"<All keys matched successfully>\n"
]
},
{
"data": {
"text/plain": [
"'../audio/input_miku_p12_i0.5_fr3_rms0.25_pro0.33_rmvpe.wav'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
} }
], ],
"source": [ "source": [
@ -102,7 +95,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.14" "version": "3.10.9"
} }
}, },
"nbformat": 4, "nbformat": 4,