audio.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136
  1. import librosa
  2. import librosa.filters
  3. import numpy as np
  4. # import tensorflow as tf
  5. from scipy import signal
  6. from scipy.io import wavfile
  7. from .hparams import hparams as hp
  8. def load_wav(path, sr):
  9. return librosa.core.load(path, sr=sr)[0]
  10. def save_wav(wav, path, sr):
  11. wav *= 32767 / max(0.01, np.max(np.abs(wav)))
  12. #proposed by @dsmiller
  13. wavfile.write(path, sr, wav.astype(np.int16))
  14. def save_wavenet_wav(wav, path, sr):
  15. librosa.output.write_wav(path, wav, sr=sr)
  16. def preemphasis(wav, k, preemphasize=True):
  17. if preemphasize:
  18. return signal.lfilter([1, -k], [1], wav)
  19. return wav
  20. def inv_preemphasis(wav, k, inv_preemphasize=True):
  21. if inv_preemphasize:
  22. return signal.lfilter([1], [1, -k], wav)
  23. return wav
  24. def get_hop_size():
  25. hop_size = hp.hop_size
  26. if hop_size is None:
  27. assert hp.frame_shift_ms is not None
  28. hop_size = int(hp.frame_shift_ms / 1000 * hp.sample_rate)
  29. return hop_size
  30. def linearspectrogram(wav):
  31. D = _stft(preemphasis(wav, hp.preemphasis, hp.preemphasize))
  32. S = _amp_to_db(np.abs(D)) - hp.ref_level_db
  33. if hp.signal_normalization:
  34. return _normalize(S)
  35. return S
  36. def melspectrogram(wav):
  37. D = _stft(preemphasis(wav, hp.preemphasis, hp.preemphasize))
  38. S = _amp_to_db(_linear_to_mel(np.abs(D))) - hp.ref_level_db
  39. if hp.signal_normalization:
  40. return _normalize(S)
  41. return S
  42. def _lws_processor():
  43. import lws
  44. return lws.lws(hp.n_fft, get_hop_size(), fftsize=hp.win_size, mode="speech")
  45. def _stft(y):
  46. if hp.use_lws:
  47. return _lws_processor(hp).stft(y).T
  48. else:
  49. return librosa.stft(y=y, n_fft=hp.n_fft, hop_length=get_hop_size(), win_length=hp.win_size)
  50. ##########################################################
# These are only correct when using lws!!! (This was messing with Wavenet quality for a long time!)
  52. def num_frames(length, fsize, fshift):
  53. """Compute number of time frames of spectrogram
  54. """
  55. pad = (fsize - fshift)
  56. if length % fshift == 0:
  57. M = (length + pad * 2 - fsize) // fshift + 1
  58. else:
  59. M = (length + pad * 2 - fsize) // fshift + 2
  60. return M
  61. def pad_lr(x, fsize, fshift):
  62. """Compute left and right padding
  63. """
  64. M = num_frames(len(x), fsize, fshift)
  65. pad = (fsize - fshift)
  66. T = len(x) + 2 * pad
  67. r = (M - 1) * fshift + fsize - T
  68. return pad, pad + r
  69. ##########################################################
  70. #Librosa correct padding
  71. def librosa_pad_lr(x, fsize, fshift):
  72. return 0, (x.shape[0] // fshift + 1) * fshift - x.shape[0]
  73. # Conversions
  74. _mel_basis = None
  75. def _linear_to_mel(spectogram):
  76. global _mel_basis
  77. if _mel_basis is None:
  78. _mel_basis = _build_mel_basis()
  79. return np.dot(_mel_basis, spectogram)
  80. def _build_mel_basis():
  81. assert hp.fmax <= hp.sample_rate // 2
  82. return librosa.filters.mel(sr=float(hp.sample_rate), n_fft=hp.n_fft, n_mels=hp.num_mels,
  83. fmin=hp.fmin, fmax=hp.fmax)
  84. def _amp_to_db(x):
  85. min_level = np.exp(hp.min_level_db / 20 * np.log(10))
  86. return 20 * np.log10(np.maximum(min_level, x))
  87. def _db_to_amp(x):
  88. return np.power(10.0, (x) * 0.05)
  89. def _normalize(S):
  90. if hp.allow_clipping_in_normalization:
  91. if hp.symmetric_mels:
  92. return np.clip((2 * hp.max_abs_value) * ((S - hp.min_level_db) / (-hp.min_level_db)) - hp.max_abs_value,
  93. -hp.max_abs_value, hp.max_abs_value)
  94. else:
  95. return np.clip(hp.max_abs_value * ((S - hp.min_level_db) / (-hp.min_level_db)), 0, hp.max_abs_value)
  96. assert S.max() <= 0 and S.min() - hp.min_level_db >= 0
  97. if hp.symmetric_mels:
  98. return (2 * hp.max_abs_value) * ((S - hp.min_level_db) / (-hp.min_level_db)) - hp.max_abs_value
  99. else:
  100. return hp.max_abs_value * ((S - hp.min_level_db) / (-hp.min_level_db))
  101. def _denormalize(D):
  102. if hp.allow_clipping_in_normalization:
  103. if hp.symmetric_mels:
  104. return (((np.clip(D, -hp.max_abs_value,
  105. hp.max_abs_value) + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value))
  106. + hp.min_level_db)
  107. else:
  108. return ((np.clip(D, 0, hp.max_abs_value) * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db)
  109. if hp.symmetric_mels:
  110. return (((D + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value)) + hp.min_level_db)
  111. else:
  112. return ((D * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db)