diff --git a/api.py b/api.py
index cf9e213..281ad53 100644
--- a/api.py
+++ b/api.py
@@ -1,7 +1,8 @@
 from fastapi import FastAPI, File, Query, Response, UploadFile
 from fastapi.encoders import jsonable_encoder
 from fastapi.responses import FileResponse
-from config import TOKEN
+from config import EDGETTS_VOICE, TOKEN
+import edge_tts
 import hmac
 import model
 from pydantic import BaseModel
@@ -47,6 +48,24 @@ async def rvc(token: str,
 
     return FileResponse(ai_vocals_path)
 
+@app.post("/tts")
+async def tts(token: str,
+              text: str,
+              response: Response,
+              pitch_change_oct: Annotated[int, Query()] = 1,
+              pitch_change_sem: Annotated[int, Query()] = 0):
+    if not hmac.compare_digest(token, TOKEN):
+        response.status_code = 401
+        return {"error": "Bad token"}
+
+    with tempfile.NamedTemporaryFile() as tmp:
+        communicate = edge_tts.Communicate(text, EDGETTS_VOICE)
+        await communicate.save(tmp.name)
+        ai_vocals_path = song_cover_pipeline(tmp.name, pitch_change_oct, voice_model='miku', pitch_change_sem=pitch_change_sem)
+
+    return FileResponse(ai_vocals_path)
+
+
 @app.get("/ping")
 def ping():
     return {"message": "pong"}
diff --git a/config_example.py b/config_example.py
index 102f41a..6a27581 100644
--- a/config_example.py
+++ b/config_example.py
@@ -1 +1,2 @@
 TOKEN = "4BJkM2NO3Y6wFDXuHdKc"
+EDGETTS_VOICE = "en-US-AriaNeural"
diff --git a/requirements.txt b/requirements.txt
index 38927bb..4405118 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 #deemix
+edge-tts==6.1.11
 fairseq==0.12.2
 faiss-cpu==1.7.3
 fastapi==0.110.0
diff --git a/rvc/rvc.ipynb b/rvc/rvc.ipynb
index 01246ff..53d41e8 100644
--- a/rvc/rvc.ipynb
+++ b/rvc/rvc.ipynb
@@ -2,16 +2,19 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024-03-31 05:59:31 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX\n",
-      "2024-03-31 05:59:32 | INFO | faiss.loader | Loading faiss with AVX2 support.\n",
-      "2024-03-31 05:59:32 | INFO | faiss.loader | Successfully loaded faiss with AVX2 support.\n"
+     "ename": "ImportError",
+     "evalue": "attempted relative import with no known parent package",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[12], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mmain\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m song_cover_pipeline\n",
+      "File \u001b[0;32m~/MikuAI/rvc/main.py:9\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msox\u001b[39;00m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m#from pedalboard import Pedalboard, Reverb, Compressor, HighpassFilter\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;66;03m#from pedalboard.io import AudioFile\u001b[39;00m\n\u001b[0;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrvc\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Config, load_hubert, get_vc, rvc_infer\n\u001b[1;32m 11\u001b[0m BASE_DIR \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mdirname(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mabspath(\u001b[38;5;18m__file__\u001b[39m))\n\u001b[1;32m 13\u001b[0m rvc_models_dir \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(BASE_DIR, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrvc_models\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
+      "\u001b[0;31mImportError\u001b[0m: attempted relative import with no known parent package"
      ]
     }
    ],
@@ -40,45 +43,35 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import edge_tts\n",
+    "voice = 'en-US-AriaNeural'\n",
+    "\n",
+    "async def tts():\n",
+    "    communicate = edge_tts.Communicate('vinnybobinny', voice)\n",
+    "    await communicate.save('../audio/input.ogg')\n",
+    "\n",
+    "await tts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Output file path will be: ../audio/input_miku_p12_i0.5_fr3_rms0.25_pro0.33_rmvpe.wav\n",
-      "[~] Converting voice using RVC...\n"
+     "ename": "NameError",
+     "evalue": "name 'song_cover_pipeline' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43msong_cover_pipeline\u001b[49m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m../audio/input.ogg\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;241m1\u001b[39m)\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'song_cover_pipeline' is not defined"
      ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024-03-31 05:59:45 | INFO | fairseq.tasks.hubert_pretraining | current directory is /home/jshiffer/rvc\n",
-      "2024-03-31 05:59:45 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': 'metadata', 'fine_tuning': False, 'labels': ['km'], 'label_dir': 'label', 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}\n",
-      "2024-03-31 05:59:45 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': gelu, 'layer_type': transformer, 'dropout': 0.1, 'attention_dropout': 0.1, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.05, 'dropout_input': 0.1, 'dropout_features': 0.1, 'final_dim': 256, 'untie_final_proj': True, 'layer_norm_first': False, 'conv_feature_layers': '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2', 'conv_bias': False, 'logit_temp': 0.1, 'target_glu': False, 'feature_grad_mult': 0.1, 'mask_length': 10, 'mask_prob': 0.8, 'mask_selection': static, 'mask_other': 0.0, 'no_mask_overlap': False, 'mask_min_space': 1, 'mask_channel_length': 10, 'mask_channel_prob': 0.0, 'mask_channel_selection': static, 'mask_channel_other': 0.0, 'no_mask_channel_overlap': False, 'mask_channel_min_space': 1, 'conv_pos': 128, 'conv_pos_groups': 16, 'latent_temp': [2.0, 0.5, 0.999995], 'skip_masked': False, 'skip_nomask': False, 'checkpoint_activations': False, 'required_seq_len_multiple': 2, 'depthwise_conv_kernel_size': 31, 'attn_type': '', 'pos_enc_type': 'abs', 'fp16': False}\n",
-      "/opt/conda/envs/rvc/lib/python3.10/site-packages/torch/nn/utils/weight_norm.py:28: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n",
-      "  warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "gin_channels: 256 self.spk_embed_dim: 109\n",
-      "\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "'../audio/input_miku_p12_i0.5_fr3_rms0.25_pro0.33_rmvpe.wav'"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
     }
    ],
    "source": [
@@ -102,7 +95,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.14"
+   "version": "3.10.9"
   }
  },
  "nbformat": 4,