Add tts
This commit is contained in:
		
							parent
							
								
									f840a7f321
								
							
						
					
					
						commit
						c990740219
					
				
							
								
								
									
										21
									
								
								api.py
									
									
									
									
									
								
							
							
						
						
									
										21
									
								
								api.py
									
									
									
									
									
								
							| @ -1,7 +1,8 @@ | ||||
| from fastapi import FastAPI, File, Query, Response, UploadFile | ||||
| from fastapi.encoders import jsonable_encoder | ||||
| from fastapi.responses import FileResponse | ||||
| from config import TOKEN | ||||
| from config import EDGETTS_VOICE, TOKEN | ||||
| import edge_tts | ||||
| import hmac | ||||
| import model | ||||
| from pydantic import BaseModel | ||||
| @ -47,6 +48,24 @@ async def rvc(token: str, | ||||
| 
 | ||||
|     return FileResponse(ai_vocals_path) | ||||
| 
 | ||||
| @app.post("/tts") | ||||
| async def tts(token: str, | ||||
|     text: str, | ||||
|     response: Response, | ||||
|     pitch_change_oct: Annotated[int, Query()] = 1, | ||||
|     pitch_change_sem: Annotated[int, Query()] = 0): | ||||
|     if not hmac.compare_digest(token, TOKEN): | ||||
|         response.status_code = 401 | ||||
|         return {"error": "Bad token"} | ||||
|      | ||||
|     with tempfile.NamedTemporaryFile() as tmp: | ||||
|         communicate = edge_tts.Communicate(text, EDGETTS_VOICE) | ||||
|         await communicate.save(tmp.name) | ||||
|         ai_vocals_path = song_cover_pipeline(tmp.name, pitch_change_oct, voice_model='miku', pitch_change_sem=pitch_change_sem) | ||||
| 
 | ||||
|     return FileResponse(ai_vocals_path) | ||||
| 
 | ||||
| 
 | ||||
| @app.get("/ping") | ||||
| def ping(): | ||||
|     return {"message": "pong"} | ||||
|  | ||||
| @ -1 +1,2 @@ | ||||
| TOKEN = "4BJkM2NO3Y6wFDXuHdKc" | ||||
| EDGETTS_VOICE = "en-US-AriaNeural" | ||||
|  | ||||
| @ -1,4 +1,5 @@ | ||||
| #deemix | ||||
| edge-tts==6.1.11 | ||||
| fairseq==0.12.2 | ||||
| faiss-cpu==1.7.3 | ||||
| fastapi==0.110.0 | ||||
|  | ||||
| @ -2,16 +2,19 @@ | ||||
|  "cells": [ | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 1, | ||||
|    "execution_count": 12, | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stderr", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "2024-03-31 05:59:31 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX\n", | ||||
|       "2024-03-31 05:59:32 | INFO | faiss.loader | Loading faiss with AVX2 support.\n", | ||||
|       "2024-03-31 05:59:32 | INFO | faiss.loader | Successfully loaded faiss with AVX2 support.\n" | ||||
|      "ename": "ImportError", | ||||
|      "evalue": "attempted relative import with no known parent package", | ||||
|      "output_type": "error", | ||||
|      "traceback": [ | ||||
|       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | ||||
|       "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)", | ||||
|       "Cell \u001b[0;32mIn[12], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mmain\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m song_cover_pipeline\n", | ||||
|       "File \u001b[0;32m~/MikuAI/rvc/main.py:9\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msox\u001b[39;00m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;66;03m#from pedalboard import Pedalboard, Reverb, Compressor, HighpassFilter\u001b[39;00m\n\u001b[1;32m      7\u001b[0m \u001b[38;5;66;03m#from pedalboard.io import AudioFile\u001b[39;00m\n\u001b[0;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrvc\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Config, load_hubert, get_vc, rvc_infer\n\u001b[1;32m     11\u001b[0m BASE_DIR \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mdirname(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mabspath(\u001b[38;5;18m__file__\u001b[39m))\n\u001b[1;32m     13\u001b[0m rvc_models_dir \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(BASE_DIR, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrvc_models\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", | ||||
|       "\u001b[0;31mImportError\u001b[0m: attempted relative import with no known parent package" | ||||
|      ] | ||||
|     } | ||||
|    ], | ||||
| @ -40,45 +43,35 @@ | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 2, | ||||
|    "execution_count": 6, | ||||
|    "metadata": {}, | ||||
|    "outputs": [], | ||||
|    "source": [ | ||||
|     "import edge_tts\n", | ||||
|     "voice = 'en-US-AriaNeural'\n", | ||||
|     "\n", | ||||
|     "async def tts():\n", | ||||
|     "    communicate = edge_tts.Communicate('vinnybobinny', voice)\n", | ||||
|     "    await communicate.save('../audio/input.ogg')\n", | ||||
|     "\n", | ||||
|     "await tts()" | ||||
|    ] | ||||
|   }, | ||||
|   { | ||||
|    "cell_type": "code", | ||||
|    "execution_count": 7, | ||||
|    "metadata": {}, | ||||
|    "outputs": [ | ||||
|     { | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "Output file path will be: ../audio/input_miku_p12_i0.5_fr3_rms0.25_pro0.33_rmvpe.wav\n", | ||||
|       "[~] Converting voice using RVC...\n" | ||||
|      "ename": "NameError", | ||||
|      "evalue": "name 'song_cover_pipeline' is not defined", | ||||
|      "output_type": "error", | ||||
|      "traceback": [ | ||||
|       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | ||||
|       "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)", | ||||
|       "Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43msong_cover_pipeline\u001b[49m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m../audio/input.ogg\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;241m1\u001b[39m)\n", | ||||
|       "\u001b[0;31mNameError\u001b[0m: name 'song_cover_pipeline' is not defined" | ||||
|      ] | ||||
|     }, | ||||
|     { | ||||
|      "name": "stderr", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "2024-03-31 05:59:45 | INFO | fairseq.tasks.hubert_pretraining | current directory is /home/jshiffer/rvc\n", | ||||
|       "2024-03-31 05:59:45 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': 'metadata', 'fine_tuning': False, 'labels': ['km'], 'label_dir': 'label', 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}\n", | ||||
|       "2024-03-31 05:59:45 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': gelu, 'layer_type': transformer, 'dropout': 0.1, 'attention_dropout': 0.1, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.05, 'dropout_input': 0.1, 'dropout_features': 0.1, 'final_dim': 256, 'untie_final_proj': True, 'layer_norm_first': False, 'conv_feature_layers': '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2', 'conv_bias': False, 'logit_temp': 0.1, 'target_glu': False, 'feature_grad_mult': 0.1, 'mask_length': 10, 'mask_prob': 0.8, 'mask_selection': static, 'mask_other': 0.0, 'no_mask_overlap': False, 'mask_min_space': 1, 'mask_channel_length': 10, 'mask_channel_prob': 0.0, 'mask_channel_selection': static, 'mask_channel_other': 0.0, 'no_mask_channel_overlap': False, 'mask_channel_min_space': 1, 'conv_pos': 128, 'conv_pos_groups': 16, 'latent_temp': [2.0, 0.5, 0.999995], 'skip_masked': False, 'skip_nomask': False, 'checkpoint_activations': False, 'required_seq_len_multiple': 2, 'depthwise_conv_kernel_size': 31, 'attn_type': '', 'pos_enc_type': 'abs', 'fp16': False}\n", | ||||
|       "/opt/conda/envs/rvc/lib/python3.10/site-packages/torch/nn/utils/weight_norm.py:28: UserWarning: torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\n", | ||||
|       "  warnings.warn(\"torch.nn.utils.weight_norm is deprecated in favor of torch.nn.utils.parametrizations.weight_norm.\")\n" | ||||
|      ] | ||||
|     }, | ||||
|     { | ||||
|      "name": "stdout", | ||||
|      "output_type": "stream", | ||||
|      "text": [ | ||||
|       "gin_channels: 256 self.spk_embed_dim: 109\n", | ||||
|       "<All keys matched successfully>\n" | ||||
|      ] | ||||
|     }, | ||||
|     { | ||||
|      "data": { | ||||
|       "text/plain": [ | ||||
|        "'../audio/input_miku_p12_i0.5_fr3_rms0.25_pro0.33_rmvpe.wav'" | ||||
|       ] | ||||
|      }, | ||||
|      "execution_count": 2, | ||||
|      "metadata": {}, | ||||
|      "output_type": "execute_result" | ||||
|     } | ||||
|    ], | ||||
|    "source": [ | ||||
| @ -102,7 +95,7 @@ | ||||
|    "name": "python", | ||||
|    "nbconvert_exporter": "python", | ||||
|    "pygments_lexer": "ipython3", | ||||
|    "version": "3.10.14" | ||||
|    "version": "3.10.9" | ||||
|   } | ||||
|  }, | ||||
|  "nbformat": 4, | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user