Audio processing with Python
Introduction
The weather was bad today, so I stayed at home and decided to learn something new. I remember studying digital filters at university, and it was somewhat boring. At the time it was not so easy to create useful and practical examples.
Fortunately nowadays Python exists and it's really easy to play with sound processing as can be seen on this page.
Most of the code found on this page was created using snippets found on the internet.
Processing wave files and plotting spectrograms
import numpy as np
import matplotlib.pyplot as plt
import wave
from scipy.io import wavfile
import contextlib
# from http://stackoverflow.com/questions/2226853/interpreting-wav-data/2227174#2227174
def interpret_wav(raw_bytes, n_frames, n_channels, sample_width, interleaved=True):
    """Convert raw PCM bytes from a WAV file into a per-channel numpy array.

    Parameters
    ----------
    raw_bytes : bytes
        Raw audio data as returned by ``wave.Wave_read.readframes()``.
    n_frames : int
        Number of audio frames contained in ``raw_bytes``.
    n_channels : int
        Number of channels (1 = mono, 2 = stereo).
    sample_width : int
        Bytes per sample: 1 (unsigned 8-bit) or 2 (signed 16-bit).
    interleaved : bool
        True when samples of the channels alternate in the raw data.

    Returns
    -------
    numpy.ndarray of shape (n_channels, n_frames), one row per channel.

    Raises
    ------
    ValueError
        If ``sample_width`` is not 1 or 2.
    """
    if sample_width == 1:
        dtype = np.uint8   # unsigned char
    elif sample_width == 2:
        dtype = np.int16   # signed 2-byte short
    else:
        raise ValueError("Only supports 8 and 16 bit audio formats.")

    channels = np.frombuffer(raw_bytes, dtype=dtype)

    if interleaved:
        # Channels are interleaved: sample N of channel M follows sample N of
        # channel M-1 in the raw data, so reshape frame-major and transpose.
        # reshape() (instead of assigning .shape) is safe on the read-only
        # view returned by frombuffer and raises a clear error on a size
        # mismatch.
        channels = channels.reshape((n_frames, n_channels)).T
    else:
        # Channels are not interleaved: all samples from channel M occur
        # before all samples from channel M+1.
        channels = channels.reshape((n_channels, n_frames))
    return channels
def get_start_end_frames(nFrames, sampleRate, tStart=None, tEnd=None):
    """Translate start/end times in seconds into frame indices.

    Parameters
    ----------
    nFrames : int
        Total number of frames in the file.
    sampleRate : int
        Sampling rate in frames per second.
    tStart, tEnd : float or None
        Optional segment boundaries in seconds. Invalid or missing values
        fall back to the start/end of the file.

    Returns
    -------
    (start, end, end - start) as integer frame indices; the wave module
    requires ints for setpos()/readframes(), so float times are truncated.
    """
    # Use the requested start only when it lies inside the file.
    if tStart and tStart * sampleRate < nFrames:
        start = int(tStart * sampleRate)
    else:
        start = 0
    # The end must lie inside the file and after the start position.
    if tEnd and tEnd * sampleRate < nFrames and tEnd * sampleRate > start:
        end = int(tEnd * sampleRate)
    else:
        end = nFrames
    return (start, end, end - start)
def extract_audio(fname, tStart=None, tEnd=None):
    """Read a segment of a WAV file and return its channel data.

    Parameters
    ----------
    fname : str
        Path to the WAV file.
    tStart, tEnd : float or None
        Optional segment boundaries in seconds (default: whole file).

    Returns
    -------
    (channels, nChannels, sampleRate, ampWidth, nFrames) where ``channels``
    is a (nChannels, segFrames) numpy array and ``nFrames`` is the total
    frame count of the file (not of the segment).
    """
    # contextlib.closing guarantees the file is closed even on error,
    # so no explicit close() is needed.
    with contextlib.closing(wave.open(fname, 'rb')) as spf:
        sampleRate = spf.getframerate()
        ampWidth = spf.getsampwidth()
        nChannels = spf.getnchannels()
        nFrames = spf.getnframes()

        startFrame, endFrame, segFrames = get_start_end_frames(nFrames, sampleRate, tStart, tEnd)

        # Extract raw audio of the requested segment from the (possibly
        # multi-channel) WAV file; wave requires integer frame counts.
        spf.setpos(int(startFrame))
        sig = spf.readframes(int(segFrames))

    channels = interpret_wav(sig, int(segFrames), nChannels, ampWidth, True)
    return (channels, nChannels, sampleRate, ampWidth, nFrames)
def convert_to_mono(channels, nChannels, outputType):
    """Fold a multi-channel signal down to a single mono track.

    Stereo input is averaged sample-by-sample; anything else passes
    channel 0 through unchanged. The result is cast to ``outputType``.
    """
    # Averaging happens in float64 (np.mean) before the final cast.
    mono = np.mean(np.array([channels[0], channels[1]]), axis=0) if nChannels == 2 else channels[0]
    return mono.astype(outputType)
def plot_specgram(samples, sampleRate, tStart=None, tEnd=None):
    """Render a spectrogram of ``samples`` on a large standalone figure.

    tStart/tEnd only relabel the x axis (seconds); they do not slice
    the data.
    """
    plt.figure(figsize=(20, 10))
    # 1024-point FFT windows with 192 samples of overlap.
    plt.specgram(samples, Fs=sampleRate, NFFT=1024, noverlap=192,
                 cmap='nipy_spectral', xextent=(tStart, tEnd))
    plt.ylabel('Frequency [Hz]')
    plt.xlabel('Time [sec]')
    plt.show()
def plot_audio_samples(title, samples, sampleRate, tStart=None, tEnd=None):
    """Plot a waveform and its spectrogram stacked on a shared time axis."""
    # Fall back to the full duration when the bounds are missing or inverted.
    if not tStart:
        tStart = 0
    if not tEnd or tStart > tEnd:
        tEnd = len(samples) / sampleRate

    fig, (ax_wave, ax_spec) = plt.subplots(2, sharex=True, figsize=(20, 10))
    ax_wave.set_title(title)
    ax_wave.plot(np.linspace(tStart, tEnd, len(samples)), samples)
    ax_wave.set_ylabel('Amplitude')
    ax_spec.specgram(samples, Fs=sampleRate, NFFT=1024, noverlap=192,
                     cmap='nipy_spectral', xextent=(tStart, tEnd))
    ax_spec.set_ylabel('Frequency [Hz]')
    plt.xlabel('Time [sec]')
    plt.show()
# Extract the first 20 seconds of the song, plot waveform + spectrogram,
# and save the mono segment as a WAV file.
tStart=0
tEnd=20
channels, nChannels, sampleRate, ampWidth, nFrames = extract_audio('sultans.wav', tStart, tEnd)
samples = convert_to_mono(channels, nChannels, np.int16)
plot_audio_samples("Sultans of Swing - First 20s", samples, sampleRate, tStart, tEnd)
wavfile.write('sultans_20s.wav', sampleRate, samples)
# IPython shell escape: encode the WAV to MP3 for embedding in the page.
!ffmpeg -y -loglevel panic -i sultans_20s.wav sultans_20s.mp3
Processed audio:
Spectrograms: Example 2
# Same processing for the second song: extract, plot and save the first 20 s.
tStart=0
tEnd=20
channels, nChannels, sampleRate, ampWidth, nFrames = extract_audio('about.wav', tStart, tEnd)
samples = convert_to_mono(channels, nChannels, np.int16)
plot_audio_samples("About a Girl - First 20s", samples, sampleRate, tStart, tEnd)
wavfile.write('about_20s.wav', sampleRate, samples)
# IPython shell escape: encode the WAV to MP3 for embedding in the page.
!ffmpeg -y -loglevel panic -i about_20s.wav about_20s.mp3
Processed audio:
Filtering the whistle in "About a Girl" intro
Now let's use a digital filter to extract the whistle between the 13 and 15 s marks in the "About a Girl" intro. In this case a band-pass FIR filter is used. The low-pass, high-pass and band-reject filters are also implemented, as they will be used later.
def fir_high_pass(samples, fs, fH, N, outputType):
    """Apply a windowed-sinc FIR high-pass filter to ``samples``.

    Reference: https://fiiir.com

    fs: sampling rate in Hz; fH: cutoff frequency in Hz; N: number of
    filter taps. Returns the full convolution (len(samples) + N - 1
    samples) cast to ``outputType``.
    """
    cutoff = fH / fs  # normalised cutoff, cycles/sample
    # Start from a windowed-sinc low-pass kernel with unity DC gain...
    taps = np.sinc(2 * cutoff * (np.arange(N) - (N - 1) / 2.))
    taps *= np.hamming(N)
    taps /= np.sum(taps)
    # ...then turn it into a high-pass kernel via spectral inversion.
    taps = -taps
    taps[int((N - 1) / 2)] += 1
    # Applying the filter is a plain convolution with the signal.
    return np.convolve(samples, taps).astype(outputType)
def fir_low_pass(samples, fs, fL, N, outputType):
    """Apply a windowed-sinc FIR low-pass filter to ``samples``.

    Reference: https://fiiir.com

    fs: sampling rate in Hz; fL: cutoff frequency in Hz; N: number of
    filter taps. Returns the full convolution (len(samples) + N - 1
    samples) cast to ``outputType``.
    """
    cutoff = fL / fs  # normalised cutoff, cycles/sample
    # Ideal sinc response truncated to N taps...
    taps = np.sinc(2 * cutoff * (np.arange(N) - (N - 1) / 2.))
    taps *= np.hamming(N)   # Hamming window tames the truncation ripple
    taps /= np.sum(taps)    # normalise for unity gain at DC
    # Applying the filter is a plain convolution with the signal.
    return np.convolve(samples, taps).astype(outputType)
def fir_band_reject(samples, fs, fL, fH, NL, NH, outputType):
    """Apply a windowed-sinc FIR band-reject (notch) filter to ``samples``.

    Reference: https://fiiir.com

    fs: sampling rate in Hz. fL/fH: lower/upper edge of the rejected
    band in Hz. NL/NH: tap counts for the low-pass and high-pass parts.
    Returns the full convolution cast to ``outputType``.
    """
    # Normalise the band edges to cycles/sample.
    norm_low = fL / fs
    norm_high = fH / fs

    # Low-pass kernel with cutoff at the lower band edge.
    lp = np.sinc(2 * norm_low * (np.arange(NL) - (NL - 1) / 2.))
    lp *= np.blackman(NL)
    lp /= np.sum(lp)

    # High-pass kernel with cutoff at the upper band edge,
    # built from a low-pass via spectral inversion.
    hp = np.sinc(2 * norm_high * (np.arange(NH) - (NH - 1) / 2.))
    hp *= np.blackman(NH)
    hp /= np.sum(hp)
    hp = -hp
    hp[int((NH - 1) / 2)] += 1

    # Summing the kernels (aligned on their centres) rejects the band between.
    if NH >= NL:
        combined = hp
        offset = int((NH - NL) / 2)
        combined[offset : offset + NL] += lp
    else:
        combined = lp
        offset = int((NL - NH) / 2)
        combined[offset : offset + NH] += hp

    # Applying the filter is a plain convolution with the signal.
    return np.convolve(samples, combined).astype(outputType)
def fir_band_pass(samples, fs, fL, fH, NL, NH, outputType):
    """Apply a windowed-sinc FIR band-pass filter to ``samples``.

    Reference: https://fiiir.com

    fs: sampling rate in Hz. fL/fH: lower/upper edge of the passed band
    in Hz. NL/NH: tap counts for the high-pass and low-pass parts.
    Returns the full convolution cast to ``outputType``.
    """
    # Normalise the band edges to cycles/sample.
    norm_low = fL / fs
    norm_high = fH / fs

    # Low-pass kernel with cutoff at the upper band edge.
    lp = np.sinc(2 * norm_high * (np.arange(NH) - (NH - 1) / 2.))
    lp *= np.blackman(NH)
    lp /= np.sum(lp)

    # High-pass kernel with cutoff at the lower band edge,
    # built from a low-pass via spectral inversion.
    hp = np.sinc(2 * norm_low * (np.arange(NL) - (NL - 1) / 2.))
    hp *= np.blackman(NL)
    hp /= np.sum(hp)
    hp = -hp
    hp[int((NL - 1) / 2)] += 1

    # Cascading (convolving) the kernels keeps only the band between the edges.
    kernel = np.convolve(lp, hp)
    # Applying the filter is a plain convolution with the signal.
    return np.convolve(samples, kernel).astype(outputType)
# Extract the 12-15 s window of the intro that contains the whistle,
# plot it and save the unfiltered reference.
tStart = 12
tEnd = 15
channels, nChannels, sampleRate, ampWidth, nFrames = extract_audio('about.wav', tStart, tEnd)
samples = convert_to_mono(channels, nChannels, np.int16)
plot_audio_samples("About a Girl section - Before Filtering", samples, sampleRate, tStart, tEnd)
wavfile.write('about_original.wav', sampleRate, samples)
# IPython shell escape: encode the WAV to MP3 for embedding in the page.
!ffmpeg -y -loglevel panic -i about_original.wav about_original.mp3
Before filtering:
It is possible to see the whistle in the spectrogram. There are three components in the following bands:
- 2400 to 2900Hz
- Around 5000Hz
- Around 7500Hz
The predominant sound, though, is contained in the first band, and that's the one we will try to isolate.
# Band-pass between 2.4 and 2.9 kHz to isolate the whistle's main component.
samples_filtered = fir_band_pass(samples, sampleRate, 2400, 2900, 461, 461, np.int16)
# NOTE(review): doubling int16 samples can wrap around on loud passages — confirm acceptable.
samples_filtered = samples_filtered * 2 # Sound amplification
plot_audio_samples("About a Girl section - After Filtering", samples_filtered, sampleRate, tStart, tEnd)
wavfile.write('about_whistle.wav', sampleRate, samples_filtered)
# IPython shell escape: encode the WAV to MP3 for embedding in the page.
!ffmpeg -y -loglevel panic -i about_whistle.wav about_whistle.mp3
After filtering:
The result is not perfect. But it's possible to get the point.
Removing voice from song: attempt 1
# Re-extract the first 20 seconds as the unfiltered reference for the
# voice-removal experiments.
tStart = 0
tEnd = 20
channels, nChannels, sampleRate, ampWidth, nFrames = extract_audio('sultans.wav', tStart, tEnd)
samples = convert_to_mono(channels, nChannels, np.int16)
plot_audio_samples("Sultans of Swing - Before Filtering", samples, sampleRate, tStart, tEnd)
wavfile.write('sultans_original.wav', sampleRate, samples)
# IPython shell escape: encode the WAV to MP3 for embedding in the page.
!ffmpeg -y -loglevel panic -i sultans_original.wav sultans_original.mp3
Before filtering:
In this attempt, a very sharp pass-band filter is used to remove the frequencies associated with the voice. After some tuning, the cutoff frequencies were selected to be around 300 Hz for the low-pass filter and 6600 Hz for the high-pass filter. Two passes were used in this case.
# Keep only the spectrum outside the vocal range: low-pass below ~300 Hz and
# high-pass above 6600 Hz, each applied twice for a sharper roll-off.
lp_samples_filtered = fir_low_pass(samples, sampleRate, 300, 461, np.int16) # First pass
lp_samples_filtered = fir_low_pass(lp_samples_filtered, sampleRate, 250, 461, np.int16) # Second pass
hp_samples_filtered = fir_high_pass(samples, sampleRate, 6600, 461, np.int16) # First pass
hp_samples_filtered = fir_high_pass(hp_samples_filtered, sampleRate, 6600, 461, np.int16) # Second pass
# Average the low and high bands back into a single signal.
samples_filtered = np.mean(np.array([lp_samples_filtered, hp_samples_filtered]), axis=0).astype(np.int16)
plot_audio_samples("Sultans of Swing - After Filtering 1", samples_filtered, sampleRate, tStart, tEnd)
wavfile.write('sultans_novoice1.wav', sampleRate, samples_filtered)
# IPython shell escape: encode the WAV to MP3 for embedding in the page.
!ffmpeg -y -loglevel panic -i sultans_novoice1.wav sultans_novoice1.mp3
After filtering:
The resulting sound does not sound very natural. But the voice was filtered!
Removing voice from song: attempt 2
Apparently, a widely used technique to remove voice from songs is to mix both channels (left and right) together. Since the voice is very similar in both channels, when subtracting them, the voice will cancel.
# Re-read the segment in stereo; subtracting the channels cancels
# centre-panned content (typically the voice).
channels, nChannels, sampleRate, ampWidth, nFrames = extract_audio('sultans.wav', tStart, tEnd)
# NOTE(review): the int16 subtraction can wrap around where the channels
# differ strongly — confirm this is acceptable for the demo.
samples_no_voice = (channels[0]-channels[1]).astype(np.int16)
plot_audio_samples("Sultans of Swing - After Filtering 2", samples_no_voice, sampleRate, tStart, tEnd)
wavfile.write('sultans_novoice2.wav', sampleRate, samples_no_voice)
# IPython shell escape: encode the WAV to MP3 for embedding in the page.
!ffmpeg -y -loglevel panic -i sultans_novoice2.wav sultans_novoice2.mp3
After filtering:
I really like the result because there's a lot of reverb and echo.
Removing voice from song: mixing attempts
In the third attempt, both attempts #1 and #2 are mixed together.
lp_samples_filtered.resize(samples_no_voice.shape)
hp_samples_filtered.resize(samples_no_voice.shape)
samples = ((samples_no_voice+lp_samples_filtered+hp_samples_filtered)/3).astype(np.int16)
plot_audio_samples("Sultans of Swing - After Filtering 1+2", samples_no_voice, sampleRate, tStart, tEnd)
wavfile.write('sultans_novoice3.wav', sampleRate, samples_no_voice)
!ffmpeg -y -loglevel panic -i sultans_novoice3.wav sultans_novoice3.mp3
After filtering:
Seems almost the same as attempt #2.
Made with Jupyter Notebooks.
Software | Version |
---|---|
Python | 3.6.4 64bit [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)] |
IPython | 6.2.1 |
OS | Darwin 17.4.0 x86_64 i386 64bit |
scipy | 1.0.0 |
numpy | 1.14.0 |
matplotlib | 2.1.2 |
version_information | 1.0.3 |
Sun Mar 11 22:05:48 2018 CET |