Add tts
parent f840a7f321
commit c990740219

api.py (21 changed lines)
@@ -1,7 +1,8 @@
 from fastapi import FastAPI, File, Query, Response, UploadFile
 from fastapi.encoders import jsonable_encoder
 from fastapi.responses import FileResponse
-from config import TOKEN
+from config import EDGETTS_VOICE, TOKEN
+import edge_tts
 import hmac
 import model
 from pydantic import BaseModel
@@ -47,6 +48,24 @@ async def rvc(token: str,

     return FileResponse(ai_vocals_path)

+@app.post("/tts")
+async def tts(token: str,
+              text: str,
+              response: Response,
+              pitch_change_oct: Annotated[int, Query()] = 1,
+              pitch_change_sem: Annotated[int, Query()] = 0):
+    if not hmac.compare_digest(token, TOKEN):
+        response.status_code = 401
+        return {"error": "Bad token"}
+
+    with tempfile.NamedTemporaryFile() as tmp:
+        communicate = edge_tts.Communicate(text, EDGETTS_VOICE)
+        await communicate.save(tmp.name)
+        ai_vocals_path = song_cover_pipeline(tmp.name, pitch_change_oct, voice_model='miku', pitch_change_sem=pitch_change_sem)
+
+    return FileResponse(ai_vocals_path)
+
+
 @app.get("/ping")
 def ping():
     return {"message": "pong"}
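Since token, text, and the pitch values are plain scalar parameters with no Body annotation, FastAPI exposes them as query parameters on POST /tts. A minimal client sketch follows; the base URL, token value, and output filename are placeholders for illustration, not part of the commit.

import requests

# Hypothetical values; the token must match TOKEN in the config module.
BASE_URL = "http://localhost:8000"

resp = requests.post(
    f"{BASE_URL}/tts",
    params={
        "token": "<token>",
        "text": "Hello from Miku",
        "pitch_change_oct": 1,
        "pitch_change_sem": 0,
    },
)
resp.raise_for_status()

# The endpoint streams the converted vocal file back via FileResponse.
with open("tts_output.wav", "wb") as out:
    out.write(resp.content)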
@@ -1 +1,2 @@
 TOKEN = "4BJkM2NO3Y6wFDXuHdKc"
+EDGETTS_VOICE = "en-US-AriaNeural"
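EDGETTS_VOICE can be any Edge TTS short name. A small sketch for finding alternatives, assuming the edge_tts.list_voices() helper that ships with the same library this commit installs:

import asyncio
import edge_tts

async def show_voices() -> None:
    # Each entry is a dict describing one voice; "ShortName" is the value
    # EDGETTS_VOICE expects, e.g. "en-US-AriaNeural".
    for voice in await edge_tts.list_voices():
        if voice["Locale"].startswith("en-"):
            print(voice["ShortName"])

asyncio.run(show_voices())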
@@ -1,4 +1,5 @@
 #deemix
+edge-tts==6.1.11
 fairseq==0.12.2
 faiss-cpu==1.7.3
 fastapi==0.110.0
@@ -2,16 +2,19 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024-03-31 05:59:31 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX\n",
-      "2024-03-31 05:59:32 | INFO | faiss.loader | Loading faiss with AVX2 support.\n",
-      "2024-03-31 05:59:32 | INFO | faiss.loader | Successfully loaded faiss with AVX2 support.\n"
+     "ename": "ImportError",
+     "evalue": "attempted relative import with no known parent package",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[12], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mmain\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m song_cover_pipeline\n",
+      "File \u001b[0;32m~/MikuAI/rvc/main.py:9\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msox\u001b[39;00m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m#from pedalboard import Pedalboard, Reverb, Compressor, HighpassFilter\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;66;03m#from pedalboard.io import AudioFile\u001b[39;00m\n\u001b[0;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrvc\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Config, load_hubert, get_vc, rvc_infer\n\u001b[1;32m 11\u001b[0m BASE_DIR \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mdirname(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mabspath(\u001b[38;5;18m__file__\u001b[39m))\n\u001b[1;32m 13\u001b[0m rvc_models_dir \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(BASE_DIR, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrvc_models\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
+      "\u001b[0;31mImportError\u001b[0m: attempted relative import with no known parent package"
     ]
    }
   ],
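The ImportError captured above comes from main.py using a relative import (from .rvc import ...) while the notebook pulls it in as a top-level module (from main import song_cover_pipeline). One possible workaround, sketched from the paths shown in the traceback and not part of the commit, is to import it through its parent package instead:

import sys
from pathlib import Path

# Assumed layout (from the traceback): main.py lives in ~/MikuAI/rvc/ and uses
# "from .rvc import ...", so it only resolves when imported as part of that package.
sys.path.insert(0, str(Path.home() / "MikuAI"))

from rvc.main import song_cover_pipeline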
@@ -40,45 +43,35 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import edge_tts\n",
+    "voice = 'en-US-AriaNeural'\n",
+    "\n",
+    "async def tts():\n",
+    "    communicate = edge_tts.Communicate('vinnybobinny', voice)\n",
+    "    await communicate.save('../audio/input.ogg')\n",
+    "\n",
+    "await tts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Output file path will be: ../audio/input_miku_p12_i0.5_fr3_rms0.25_pro0.33_rmvpe.wav\n",
-      "[~] Converting voice using RVC...\n"
+     "ename": "NameError",
+     "evalue": "name 'song_cover_pipeline' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43msong_cover_pipeline\u001b[49m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m../audio/input.ogg\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;241m1\u001b[39m)\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'song_cover_pipeline' is not defined"
     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024-03-31 05:59:45 | INFO | fairseq.tasks.hubert_pretraining | current directory is /home/jshiffer/rvc\n",
-      "2024-03-31 05:59:45 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': 'metadata', 'fine_tuning': False, 'labels': ['km'], 'label_dir': 'label', 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}\n",
-      "2024-03-31 05:59:45 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': gelu, 'layer_type': transformer, 'dropout': 0.1, 'attention_dropout': 0.1, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.05, 'dropout_input': 0.1, 'dropout_features': 0.1, 'final_dim': 256, 'untie_final_proj': True, 'layer_norm_first': False, 'conv_feature_layers': '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2', 'conv_bias': False, 'logit_temp': 0.1, 'target_glu': False, 'feature_grad_mult': 0.1, 'mask_length': 10, 'mask_prob': 0.8, 'mask_selection': static, 'mask_other': 0.0, 'no_mask_overlap': False, 'mask_min_space': 1, 'mask_channel_length': 10, 'mask_channel_prob': 0.0, 'mask_channel_selection': static, 'mask_channel_other': 0.0, 'no_mask_channel_overlap': False, 'mask_channel_min_space': 1, 'conv_pos': 128, 'conv_pos_groups': 16, 'latent_temp': [2.0, 0.5, 0.999995], 'skip_masked': False, 'skip_nomask': False, 'checkpoint_activations': False, 'required_seq_len_multiple': 2, 'depthwise_conv_kernel_size': 31, 'attn_type': '', 'pos_enc_type': 'abs', 'fp16': False}\n",
-      "/opt/conda/envs/rvc/lib/python3.10/site-packages/torch/nn/utils/weight_norm.py:28: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n",
-      "  warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "gin_channels: 256 self.spk_embed_dim: 109\n",
-      "<All keys matched successfully>\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "'../audio/input_miku_p12_i0.5_fr3_rms0.25_pro0.33_rmvpe.wav'"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
     }
    ],
    "source": [
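The new cell relies on the notebook's running event loop for its top-level await; outside Jupyter the same calls need asyncio.run(). A standalone sketch using only what the cell already shows:

import asyncio
import edge_tts

VOICE = "en-US-AriaNeural"

async def tts(text: str, out_path: str) -> None:
    # Same pattern as the notebook cell: synthesize the text and write it to disk.
    communicate = edge_tts.Communicate(text, VOICE)
    await communicate.save(out_path)

if __name__ == "__main__":
    asyncio.run(tts("vinnybobinny", "../audio/input.ogg"))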
@@ -102,7 +95,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.14"
+   "version": "3.10.9"
   }
  },
  "nbformat": 4,