This commit is contained in:
James S 2024-05-05 01:16:35 -07:00
parent f840a7f321
commit c990740219
4 changed files with 58 additions and 44 deletions

21
api.py
View File

@ -1,7 +1,8 @@
from fastapi import FastAPI, File, Query, Response, UploadFile from fastapi import FastAPI, File, Query, Response, UploadFile
from fastapi.encoders import jsonable_encoder from fastapi.encoders import jsonable_encoder
from fastapi.responses import FileResponse from fastapi.responses import FileResponse
from config import TOKEN from config import EDGETTS_VOICE, TOKEN
import edge_tts
import hmac import hmac
import model import model
from pydantic import BaseModel from pydantic import BaseModel
@ -47,6 +48,24 @@ async def rvc(token: str,
return FileResponse(ai_vocals_path) return FileResponse(ai_vocals_path)
@app.post("/tts")
async def tts(token: str,
              text: str,
              response: Response,
              pitch_change_oct: Annotated[int, Query()] = 1,
              pitch_change_sem: Annotated[int, Query()] = 0):
    """Synthesize speech from `text` with edge-tts, then run it through the
    RVC song-cover pipeline using the 'miku' voice model, and return the
    converted audio file.

    Parameters:
        token: shared-secret auth token; compared against config.TOKEN.
        text: the text to synthesize.
        response: FastAPI response object, used to set the 401 status code.
        pitch_change_oct: pitch shift in octaves passed to the pipeline (default 1).
        pitch_change_sem: pitch shift in semitones passed to the pipeline (default 0).

    Returns:
        FileResponse with the converted audio on success, or a
        {"error": ...} dict with HTTP 401 on a bad token.
    """
    # hmac.compare_digest gives a constant-time comparison, avoiding a
    # timing side-channel on the shared token.
    if not hmac.compare_digest(token, TOKEN):
        response.status_code = 401
        return {"error": "Bad token"}
    with tempfile.NamedTemporaryFile() as tmp:
        # edge-tts writes the synthesized audio to tmp.name.
        # NOTE(review): tmp has no file extension — confirm that
        # song_cover_pipeline detects the audio format by content rather
        # than by suffix; if not, pass suffix=".mp3" to NamedTemporaryFile.
        communicate = edge_tts.Communicate(text, EDGETTS_VOICE)
        await communicate.save(tmp.name)
        # The pipeline consumes tmp.name while the tempfile still exists and
        # returns a path to a separate output file (not tmp), so serving it
        # after the tempfile is deleted is safe.
        ai_vocals_path = song_cover_pipeline(tmp.name, pitch_change_oct, voice_model='miku', pitch_change_sem=pitch_change_sem)
    return FileResponse(ai_vocals_path)
@app.get("/ping") @app.get("/ping")
def ping(): def ping():
return {"message": "pong"} return {"message": "pong"}

View File

@ -1 +1,2 @@
TOKEN = "4BJkM2NO3Y6wFDXuHdKc" TOKEN = "4BJkM2NO3Y6wFDXuHdKc"
EDGETTS_VOICE = "en-US-AriaNeural"

View File

@ -1,4 +1,5 @@
#deemix #deemix
edge-tts==6.1.11
fairseq==0.12.2 fairseq==0.12.2
faiss-cpu==1.7.3 faiss-cpu==1.7.3
fastapi==0.110.0 fastapi==0.110.0

View File

@ -2,16 +2,19 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 12,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stderr", "ename": "ImportError",
"output_type": "stream", "evalue": "attempted relative import with no known parent package",
"text": [ "output_type": "error",
"2024-03-31 05:59:31 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX\n", "traceback": [
"2024-03-31 05:59:32 | INFO | faiss.loader | Loading faiss with AVX2 support.\n", "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"2024-03-31 05:59:32 | INFO | faiss.loader | Successfully loaded faiss with AVX2 support.\n" "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[12], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mmain\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m song_cover_pipeline\n",
"File \u001b[0;32m~/MikuAI/rvc/main.py:9\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msox\u001b[39;00m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m#from pedalboard import Pedalboard, Reverb, Compressor, HighpassFilter\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;66;03m#from pedalboard.io import AudioFile\u001b[39;00m\n\u001b[0;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrvc\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Config, load_hubert, get_vc, rvc_infer\n\u001b[1;32m 11\u001b[0m BASE_DIR \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mdirname(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mabspath(\u001b[38;5;18m__file__\u001b[39m))\n\u001b[1;32m 13\u001b[0m rvc_models_dir \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(BASE_DIR, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrvc_models\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
"\u001b[0;31mImportError\u001b[0m: attempted relative import with no known parent package"
] ]
} }
], ],
@ -40,45 +43,35 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import edge_tts\n",
"voice = 'en-US-AriaNeural'\n",
"\n",
"async def tts():\n",
" communicate = edge_tts.Communicate('vinnybobinny', voice)\n",
" await communicate.save('../audio/input.ogg')\n",
"\n",
"await tts()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "ename": "NameError",
"output_type": "stream", "evalue": "name 'song_cover_pipeline' is not defined",
"text": [ "output_type": "error",
"Output file path will be: ../audio/input_miku_p12_i0.5_fr3_rms0.25_pro0.33_rmvpe.wav\n", "traceback": [
"[~] Converting voice using RVC...\n" "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43msong_cover_pipeline\u001b[49m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m../audio/input.ogg\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;241m1\u001b[39m)\n",
"\u001b[0;31mNameError\u001b[0m: name 'song_cover_pipeline' is not defined"
] ]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-03-31 05:59:45 | INFO | fairseq.tasks.hubert_pretraining | current directory is /home/jshiffer/rvc\n",
"2024-03-31 05:59:45 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': 'metadata', 'fine_tuning': False, 'labels': ['km'], 'label_dir': 'label', 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}\n",
"2024-03-31 05:59:45 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': gelu, 'layer_type': transformer, 'dropout': 0.1, 'attention_dropout': 0.1, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.05, 'dropout_input': 0.1, 'dropout_features': 0.1, 'final_dim': 256, 'untie_final_proj': True, 'layer_norm_first': False, 'conv_feature_layers': '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2', 'conv_bias': False, 'logit_temp': 0.1, 'target_glu': False, 'feature_grad_mult': 0.1, 'mask_length': 10, 'mask_prob': 0.8, 'mask_selection': static, 'mask_other': 0.0, 'no_mask_overlap': False, 'mask_min_space': 1, 'mask_channel_length': 10, 'mask_channel_prob': 0.0, 'mask_channel_selection': static, 'mask_channel_other': 0.0, 'no_mask_channel_overlap': False, 'mask_channel_min_space': 1, 'conv_pos': 128, 'conv_pos_groups': 16, 'latent_temp': [2.0, 0.5, 0.999995], 'skip_masked': False, 'skip_nomask': False, 'checkpoint_activations': False, 'required_seq_len_multiple': 2, 'depthwise_conv_kernel_size': 31, 'attn_type': '', 'pos_enc_type': 'abs', 'fp16': False}\n",
"/opt/conda/envs/rvc/lib/python3.10/site-packages/torch/nn/utils/weight_norm.py:28: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n",
" warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"gin_channels: 256 self.spk_embed_dim: 109\n",
"<All keys matched successfully>\n"
]
},
{
"data": {
"text/plain": [
"'../audio/input_miku_p12_i0.5_fr3_rms0.25_pro0.33_rmvpe.wav'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
} }
], ],
"source": [ "source": [
@ -102,7 +95,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.14" "version": "3.10.9"
} }
}, },
"nbformat": 4, "nbformat": 4,