hubertasr.py 1.4 KB

12345678910111213141516171819202122232425262728293031323334353637
  1. import time
  2. import torch
  3. import numpy as np
  4. from baseasr import BaseASR
  5. from ultralight.audio2feature import Audio2Feature
  6. # hubert audio feature
  7. class HubertASR(BaseASR):
  8. #audio_feat_length: select audio feature before and after
  9. def __init__(self, opt, parent, audio_processor:Audio2Feature,audio_feat_length = [8,8]):
  10. super().__init__(opt, parent)
  11. self.audio_processor = audio_processor
  12. #self.stride_left_size = 32
  13. #self.stride_right_size = 32
  14. self.audio_feat_length = audio_feat_length
  15. def run_step(self):
  16. start_time = time.time()
  17. for _ in range(self.batch_size * 2):
  18. audio_frame, type,eventpoint = self.get_audio_frame()
  19. self.frames.append(audio_frame)
  20. self.output_queue.put((audio_frame, type,eventpoint))
  21. if len(self.frames) <= self.stride_left_size + self.stride_right_size:
  22. return
  23. inputs = np.concatenate(self.frames) # [N * chunk]
  24. mel = self.audio_processor.get_hubert_from_16k_speech(inputs)
  25. mel_chunks=self.audio_processor.feature2chunks(feature_array=mel,fps=self.fps/2,batch_size=self.batch_size,audio_feat_length = self.audio_feat_length, start=self.stride_left_size/2)
  26. self.feat_queue.put(mel_chunks)
  27. self.frames = self.frames[-(self.stride_left_size + self.stride_right_size):]
  28. #print(f"Processing audio costs {(time.time() - start_time) * 1000}ms")