audio_spectrogram.py 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804
  1. import numpy as np
  2. import cv2 as cv
  3. import math
  4. import argparse
class AudioDrawing:
    '''
    Used for drawing audio graphics
    '''
    def __init__(self, args):
        # Copy parsed CLI options onto the instance; the argparse setup at the
        # bottom of the file documents the meaning and default of each one.
        self.inputType = args.inputType          # "file" or "microphone"
        self.draw = args.draw                    # "static" or "dynamic"
        self.graph = args.graph                  # "ampl", "spec" or "ampl_and_spec"
        # findFile resolves the name against the OpenCV samples data search path
        self.audio = cv.samples.findFile(args.audio)
        self.audioStream = args.audioStream      # CAP_PROP_AUDIO_STREAM index
        self.windowType = args.windowType        # STFT window: "Rect", "Hann" or "Hamming"
        self.windLen = args.windLen              # STFT window length, samples
        self.overlap = args.overlap              # STFT window overlap, samples
        self.enableGrid = args.enableGrid        # draw grid lines on the amplitude graph
        self.rows = args.rows                    # output image height, pixels
        self.cols = args.cols                    # output image width, pixels
        self.xmarkup = args.xmarkup              # number of ticks on the time axis
        self.ymarkup = args.ymarkup              # number of ticks on the amplitude/frequency axis
        self.zmarkup = args.zmarkup              # number of ticks on the colorbar
        self.microTime = args.microTime          # microphone recording time, seconds
        self.frameSizeTime = args.frameSizeTime  # sliding window size, seconds
        self.updateTime = args.updateTime        # sliding window update period, seconds
        self.waitTime = args.waitTime            # cv.waitKey delay for dynamic mode, ms
        # abort the program early on invalid arguments
        if self.initAndCheckArgs(args) is False:
            exit()
  30. def Draw(self):
  31. if self.draw == "static":
  32. if self.inputType == "file":
  33. samplingRate, inputAudio = self.readAudioFile(self.audio)
  34. elif self.inputType == "microphone":
  35. samplingRate, inputAudio = self.readAudioMicrophone()
  36. duration = len(inputAudio) // samplingRate
  37. # since the dimensional grid is counted in integer seconds,
  38. # if the input audio has an incomplete last second,
  39. # then it is filled with zeros to complete
  40. remainder = len(inputAudio) % samplingRate
  41. if remainder != 0:
  42. sizeToFullSec = samplingRate - remainder
  43. zeroArr = np.zeros(sizeToFullSec)
  44. inputAudio = np.concatenate((inputAudio, zeroArr), axis=0)
  45. duration += 1
  46. print("Update duration of audio to full second with ",
  47. sizeToFullSec, " zero samples")
  48. print("New number of samples ", len(inputAudio))
  49. if duration <= self.xmarkup:
  50. self.xmarkup = duration + 1
  51. if self.graph == "ampl":
  52. imgAmplitude = self.drawAmplitude(inputAudio)
  53. imgAmplitude = self.drawAmplitudeScale(imgAmplitude, inputAudio, samplingRate)
  54. cv.imshow("Display window", imgAmplitude)
  55. cv.waitKey(0)
  56. elif self.graph == "spec":
  57. stft = self.STFT(inputAudio)
  58. imgSpec = self.drawSpectrogram(stft)
  59. imgSpec = self.drawSpectrogramColorbar(imgSpec, inputAudio, samplingRate, stft)
  60. cv.imshow("Display window", imgSpec)
  61. cv.waitKey(0)
  62. elif self.graph == "ampl_and_spec":
  63. imgAmplitude = self.drawAmplitude(inputAudio)
  64. imgAmplitude = self.drawAmplitudeScale(imgAmplitude, inputAudio, samplingRate)
  65. stft = self.STFT(inputAudio)
  66. imgSpec = self.drawSpectrogram(stft)
  67. imgSpec = self.drawSpectrogramColorbar(imgSpec, inputAudio, samplingRate, stft)
  68. imgTotal = self.concatenateImages(imgAmplitude, imgSpec)
  69. cv.imshow("Display window", imgTotal)
  70. cv.waitKey(0)
  71. elif self.draw == "dynamic":
  72. if self.inputType == "file":
  73. self.dynamicFile(self.audio)
  74. elif self.inputType == "microphone":
  75. self.dynamicMicrophone()
  76. def readAudioFile(self, file):
  77. cap = cv.VideoCapture(file)
  78. params = [cv.CAP_PROP_AUDIO_STREAM, self.audioStream,
  79. cv.CAP_PROP_VIDEO_STREAM, -1,
  80. cv.CAP_PROP_AUDIO_DATA_DEPTH, cv.CV_16S]
  81. params = np.asarray(params)
  82. cap.open(file, cv.CAP_ANY, params)
  83. if cap.isOpened() == False:
  84. print("Error : Can't read audio file: '", self.audio, "' with audioStream = ", self.audioStream)
  85. print("Error: problems with audio reading, check input arguments")
  86. exit()
  87. audioBaseIndex = int(cap.get(cv.CAP_PROP_AUDIO_BASE_INDEX))
  88. numberOfChannels = int(cap.get(cv.CAP_PROP_AUDIO_TOTAL_CHANNELS))
  89. print("CAP_PROP_AUDIO_DATA_DEPTH: ", str((int(cap.get(cv.CAP_PROP_AUDIO_DATA_DEPTH)))))
  90. print("CAP_PROP_AUDIO_SAMPLES_PER_SECOND: ", cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND))
  91. print("CAP_PROP_AUDIO_TOTAL_CHANNELS: ", numberOfChannels)
  92. print("CAP_PROP_AUDIO_TOTAL_STREAMS: ", cap.get(cv.CAP_PROP_AUDIO_TOTAL_STREAMS))
  93. frame = []
  94. frame = np.asarray(frame)
  95. inputAudio = []
  96. while (1):
  97. if (cap.grab()):
  98. frame = []
  99. frame = np.asarray(frame)
  100. frame = cap.retrieve(frame, audioBaseIndex)
  101. for i in range(len(frame[1][0])):
  102. inputAudio.append(frame[1][0][i])
  103. else:
  104. break
  105. inputAudio = np.asarray(inputAudio)
  106. print("Number of samples: ", len(inputAudio))
  107. samplingRate = int(cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND))
  108. return samplingRate, inputAudio
  109. def readAudioMicrophone(self):
  110. cap = cv.VideoCapture()
  111. params = [cv.CAP_PROP_AUDIO_STREAM, 0, cv.CAP_PROP_VIDEO_STREAM, -1]
  112. params = np.asarray(params)
  113. cap.open(0, cv.CAP_ANY, params)
  114. if cap.isOpened() == False:
  115. print("Error: Can't open microphone")
  116. print("Error: problems with audio reading, check input arguments")
  117. exit()
  118. audioBaseIndex = int(cap.get(cv.CAP_PROP_AUDIO_BASE_INDEX))
  119. numberOfChannels = int(cap.get(cv.CAP_PROP_AUDIO_TOTAL_CHANNELS))
  120. print("CAP_PROP_AUDIO_DATA_DEPTH: ", str((int(cap.get(cv.CAP_PROP_AUDIO_DATA_DEPTH)))))
  121. print("CAP_PROP_AUDIO_SAMPLES_PER_SECOND: ", cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND))
  122. print("CAP_PROP_AUDIO_TOTAL_CHANNELS: ", numberOfChannels)
  123. print("CAP_PROP_AUDIO_TOTAL_STREAMS: ", cap.get(cv.CAP_PROP_AUDIO_TOTAL_STREAMS))
  124. cvTickFreq = cv.getTickFrequency()
  125. sysTimeCurr = cv.getTickCount()
  126. sysTimePrev = sysTimeCurr
  127. frame = []
  128. frame = np.asarray(frame)
  129. inputAudio = []
  130. while ((sysTimeCurr - sysTimePrev) / cvTickFreq < self.microTime):
  131. if (cap.grab()):
  132. frame = []
  133. frame = np.asarray(frame)
  134. frame = cap.retrieve(frame, audioBaseIndex)
  135. for i in range(len(frame[1][0])):
  136. inputAudio.append(frame[1][0][i])
  137. sysTimeCurr = cv.getTickCount()
  138. else:
  139. print("Error: Grab error")
  140. break
  141. inputAudio = np.asarray(inputAudio)
  142. print("Number of samples: ", len(inputAudio))
  143. samplingRate = int(cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND))
  144. return samplingRate, inputAudio
    def drawAmplitude(self, inputAudio):
        """Render the waveform of *inputAudio* as a 900x400 BGR image."""
        color = (247, 111, 87)
        thickness = 5
        frameVectorRows = 500
        middle = frameVectorRows // 2  # vertical centre line of the plot
        # usually the input data is too big, so it is necessary
        # to reduce size using interpolation of data
        frameVectorCols = 40000
        if len(inputAudio) < frameVectorCols:
            frameVectorCols = len(inputAudio)
        img = np.zeros((frameVectorRows, frameVectorCols, 3), np.uint8)
        img += 255  # white background
        audio = np.array(0)
        # resample the 1-D signal down (or up) to frameVectorCols points
        audio = cv.resize(inputAudio, (1, frameVectorCols), interpolation=cv.INTER_LINEAR)
        reshapeAudio = np.reshape(audio, (-1))
        # normalization data by maximum element
        minCv, maxCv, _, _ = cv.minMaxLoc(reshapeAudio)
        maxElem = int(max(abs(minCv), abs(maxCv)))
        # if all data values are zero (silence)
        if maxElem == 0:
            maxElem = 1
        # map each sample to a pixel row around the centre line
        for i in range(len(reshapeAudio)):
            reshapeAudio[i] = middle - reshapeAudio[i] * middle // maxElem
        # connect consecutive samples with line segments
        for i in range(1, frameVectorCols, 1):
            cv.line(img, (i - 1, int(reshapeAudio[i - 1])), (i, int(reshapeAudio[i])), color, thickness)
        img = cv.resize(img, (900, 400), interpolation=cv.INTER_AREA)
        return img
    def drawAmplitudeScale(self, inputImg, inputAudio, samplingRate, xmin=None, xmax=None):
        # function of layout drawing for graph of volume amplitudes
        # x axis for time
        # y axis for amplitudes
        # parameters for the new image size (margins around the graph)
        preCol = 100
        aftCol = 100
        preLine = 40
        aftLine = 50
        frameVectorRows = inputImg.shape[0]
        frameVectorCols = inputImg.shape[1]
        totalRows = preLine + frameVectorRows + aftLine
        totalCols = preCol + frameVectorCols + aftCol
        imgTotal = np.zeros((totalRows, totalCols, 3), np.uint8)
        imgTotal += 255  # white background
        # paste the waveform picture into the middle of the canvas
        imgTotal[preLine: preLine + frameVectorRows, preCol: preCol + frameVectorCols] = inputImg
        # calculating values on x axis
        if xmin is None:
            xmin = 0
        if xmax is None:
            xmax = len(inputAudio) / samplingRate
        if xmax > self.xmarkup:
            xList = np.linspace(xmin, xmax, self.xmarkup).astype(int)
        else:
            # this case is used to display a dynamic update
            tmp = np.arange(xmin, xmax, 1).astype(int) + 1
            xList = np.concatenate((np.zeros(self.xmarkup - len(tmp)), tmp[:]), axis=None)
        # calculating values on y axis
        ymin = np.min(inputAudio)
        ymax = np.max(inputAudio)
        yList = np.linspace(ymin, ymax, self.ymarkup)
        # parameters for layout drawing
        textThickness = 1
        gridThickness = 1
        gridColor = (0, 0, 0)
        textColor = (0, 0, 0)
        font = cv.FONT_HERSHEY_SIMPLEX
        fontScale = 0.5
        # horizontal axis under the graph
        cv.line(imgTotal, (preCol, totalRows - aftLine),
                (preCol + frameVectorCols, totalRows - aftLine),
                gridColor, gridThickness)
        # vertical axis for amplitude
        cv.line(imgTotal, (preCol, preLine), (preCol, preLine + frameVectorRows),
                gridColor, gridThickness)
        # parameters for layout calculation
        serifSize = 10
        indentDownX = serifSize * 2
        indentDownY = serifSize // 2
        indentLeftX = serifSize
        indentLeftY = 2 * preCol // 3
        # drawing layout for x axis
        numX = frameVectorCols // (self.xmarkup - 1)  # pixels between ticks
        for i in range(len(xList)):
            # (a1, a2) tick position on the axis, (b1, b2) end of the serif
            a1 = preCol + i * numX
            a2 = frameVectorRows + preLine
            b1 = a1
            b2 = a2 + serifSize
            if self.enableGrid is True:
                # (d1, d2) far end of the grid line across the graph
                d1 = a1
                d2 = preLine
                cv.line(imgTotal, (a1, a2), (d1, d2), gridColor, gridThickness)
            cv.line(imgTotal, (a1, a2), (b1, b2), gridColor, gridThickness)
            cv.putText(imgTotal, str(int(xList[i])), (b1 - indentLeftX, b2 + indentDownX),
                       font, fontScale, textColor, textThickness)
        # drawing layout for y axis
        numY = frameVectorRows // (self.ymarkup - 1)
        for i in range(len(yList)):
            a1 = preCol
            a2 = totalRows - aftLine - i * numY
            b1 = preCol - serifSize
            b2 = a2
            if self.enableGrid is True:
                d1 = preCol + frameVectorCols
                d2 = a2
                cv.line(imgTotal, (a1, a2), (d1, d2), gridColor, gridThickness)
            cv.line(imgTotal, (a1, a2), (b1, b2), gridColor, gridThickness)
            cv.putText(imgTotal, str(int(yList[i])), (b1 - indentLeftY, b2 + indentDownY),
                       font, fontScale, textColor, textThickness)
        # final resize to the requested output size
        imgTotal = cv.resize(imgTotal, (self.cols, self.rows), interpolation=cv.INTER_AREA)
        return imgTotal
    def STFT(self, inputAudio):
        """
        The Short-time Fourier transform (STFT), is a Fourier-related transform used to determine
        the sinusoidal frequency and phase content of local sections of a signal as it changes over
        time.
        In practice, the procedure for computing STFTs is to divide a longer time signal into
        shorter segments of equal length and then compute the Fourier transform separately on each
        shorter segment. This reveals the Fourier spectrum on each shorter segment. One then usually
        plots the changing spectra as a function of time, known as a spectrogram or waterfall plot.
        https://en.wikipedia.org/wiki/Short-time_Fourier_transform
        """
        time_step = self.windLen - self.overlap  # hop size between consecutive windows
        stft = []
        if self.windowType == "Hann":
            # https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows
            # NOTE(review): the leading factor `i` is unusual for a Hann window
            # (the textbook form has no such factor) — confirm against upstream
            Hann_wind = []
            for i in range(1 - self.windLen, self.windLen, 2):
                Hann_wind.append(i * (0.5 + 0.5 * math.cos(math.pi * i / (self.windLen - 1))))
            Hann_wind = np.asarray(Hann_wind)
        elif self.windowType == "Hamming":
            # https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows
            Hamming_wind = []
            for i in range(1 - self.windLen, self.windLen, 2):
                Hamming_wind.append(i * (0.53836 - 0.46164 * (math.cos(2 * math.pi * i / (self.windLen - 1)))))
            Hamming_wind = np.asarray(Hamming_wind)
        # slide the window over the signal with step time_step
        for index in np.arange(0, len(inputAudio), time_step).astype(int):
            section = inputAudio[index:index + self.windLen]
            # zero-pad the last (possibly short) section to the full window length
            zeroArray = np.zeros(self.windLen - len(section))
            section = np.concatenate((section, zeroArray), axis=None)
            if self.windowType == "Hann":
                section *= Hann_wind
            elif self.windowType == "Hamming":
                section *= Hamming_wind
            dst = np.empty(0)
            dst = cv.dft(section, dst, flags=cv.DFT_COMPLEX_OUTPUT)
            reshape_dst = np.reshape(dst, (-1))
            # we need only the first part of the spectrum, the second part is symmetrical
            complexArr = np.zeros(len(dst) // 4, dtype=complex)
            for i in range(len(dst) // 4):
                complexArr[i] = complex(reshape_dst[2 * i], reshape_dst[2 * i + 1])
            stft.append(np.abs(complexArr))
        # rows: frequency bins, cols: time frames
        stft = np.array(stft).transpose()
        # convert elements to the decibel scale; zero magnitudes are left
        # untouched so log10(0) is never evaluated
        np.log10(stft, out=stft, where=(stft != 0.))
        return 10 * stft
  298. def drawSpectrogram(self, stft):
  299. frameVectorRows = stft.shape[0]
  300. frameVectorCols = stft.shape[1]
  301. # Normalization of image values from 0 to 255 to get more contrast image
  302. # and this normalization will be taken into account in the scale drawing
  303. colormapImageRows = 255
  304. imgSpec = np.zeros((frameVectorRows, frameVectorCols, 3), np.uint8)
  305. stftMat = np.zeros((frameVectorRows, frameVectorCols), np.float64)
  306. cv.normalize(stft, stftMat, 1.0, 0.0, cv.NORM_INF)
  307. for i in range(frameVectorRows):
  308. for j in range(frameVectorCols):
  309. imgSpec[frameVectorRows - i - 1, j] = int(stftMat[i][j] * colormapImageRows)
  310. imgSpec = cv.applyColorMap(imgSpec, cv.COLORMAP_INFERNO)
  311. imgSpec = cv.resize(imgSpec, (900, 400), interpolation=cv.INTER_LINEAR)
  312. return imgSpec
    def drawSpectrogramColorbar(self, inputImg, inputAudio, samplingRate, stft, xmin=None, xmax=None):
        # function of layout drawing for the three-dimensional graph of the spectrogram
        # x axis for time
        # y axis for frequencies
        # z axis for magnitudes of frequencies shown by color scale
        # parameters for the new image size (margins around the graph)
        preCol = 100
        aftCol = 100
        preLine = 40
        aftLine = 50
        colColor = 20  # width of the colorbar strip, pixels
        ind_col = 20   # gap between the graph and the colorbar, pixels
        frameVectorRows = inputImg.shape[0]
        frameVectorCols = inputImg.shape[1]
        totalRows = preLine + frameVectorRows + aftLine
        totalCols = preCol + frameVectorCols + aftCol + colColor
        imgTotal = np.zeros((totalRows, totalCols, 3), np.uint8)
        imgTotal += 255  # white background
        # paste the spectrogram picture into the middle of the canvas
        imgTotal[preLine: preLine + frameVectorRows, preCol: preCol + frameVectorCols] = inputImg
        # colorbar image due to drawSpectrogram(..) picture has been normalised from 255 to 0,
        # so here colorbar has values from 255 to 0
        colorArrSize = 256
        imgColorBar = np.zeros((colorArrSize, colColor, 1), np.uint8)
        for i in range(colorArrSize):
            imgColorBar[i] += colorArrSize - 1 - i
        imgColorBar = cv.applyColorMap(imgColorBar, cv.COLORMAP_INFERNO)
        imgColorBar = cv.resize(imgColorBar, (colColor, frameVectorRows), interpolation=cv.INTER_AREA)
        # place the colorbar to the right of the graph
        imgTotal[preLine: preLine + frameVectorRows,
                 preCol + frameVectorCols + ind_col:
                 preCol + frameVectorCols + ind_col + colColor] = imgColorBar
        # calculating values on x axis
        if xmin is None:
            xmin = 0
        if xmax is None:
            xmax = len(inputAudio) / samplingRate
        if xmax > self.xmarkup:
            xList = np.linspace(xmin, xmax, self.xmarkup).astype(int)
        else:
            # this case is used to display a dynamic update
            tmpXList = np.arange(xmin, xmax, 1).astype(int) + 1
            xList = np.concatenate((np.zeros(self.xmarkup - len(tmpXList)), tmpXList[:]), axis=None)
        # calculating values on y axis
        # according to the Nyquist sampling theorem,
        # signal should posses frequencies equal to half of sampling rate
        ymin = 0
        ymax = int(samplingRate / 2.)
        yList = np.linspace(ymin, ymax, self.ymarkup).astype(int)
        # calculating values on z axis (decibel range of the spectrogram)
        zList = np.linspace(np.min(stft), np.max(stft), self.zmarkup)
        # parameters for layout drawing
        textThickness = 1
        textColor = (0, 0, 0)
        gridThickness = 1
        gridColor = (0, 0, 0)
        font = cv.FONT_HERSHEY_SIMPLEX
        fontScale = 0.5
        serifSize = 10
        indentDownX = serifSize * 2
        indentDownY = serifSize // 2
        indentLeftX = serifSize
        indentLeftY = 2 * preCol // 3
        # horizontal axis
        cv.line(imgTotal, (preCol, totalRows - aftLine), (preCol + frameVectorCols, totalRows - aftLine),
                gridColor, gridThickness)
        # vertical axis
        cv.line(imgTotal, (preCol, preLine), (preCol, preLine + frameVectorRows),
                gridColor, gridThickness)
        # drawing layout for x axis
        numX = frameVectorCols // (self.xmarkup - 1)  # pixels between ticks
        for i in range(len(xList)):
            # (a1, a2) tick position on the axis, (b1, b2) end of the serif
            a1 = preCol + i * numX
            a2 = frameVectorRows + preLine
            b1 = a1
            b2 = a2 + serifSize
            cv.line(imgTotal, (a1, a2), (b1, b2), gridColor, gridThickness)
            cv.putText(imgTotal, str(int(xList[i])), (b1 - indentLeftX, b2 + indentDownX),
                       font, fontScale, textColor, textThickness)
        # drawing layout for y axis
        numY = frameVectorRows // (self.ymarkup - 1)
        for i in range(len(yList)):
            a1 = preCol
            a2 = totalRows - aftLine - i * numY
            b1 = preCol - serifSize
            b2 = a2
            cv.line(imgTotal, (a1, a2), (b1, b2), gridColor, gridThickness)
            cv.putText(imgTotal, str(int(yList[i])), (b1 - indentLeftY, b2 + indentDownY),
                       font, fontScale, textColor, textThickness)
        # drawing layout for z axis (to the right of the colorbar)
        numZ = frameVectorRows // (self.zmarkup - 1)
        for i in range(len(zList)):
            a1 = preCol + frameVectorCols + ind_col + colColor
            a2 = totalRows - aftLine - i * numZ
            b1 = a1 + serifSize
            b2 = a2
            cv.line(imgTotal, (a1, a2), (b1, b2), gridColor, gridThickness)
            cv.putText(imgTotal, str(int(zList[i])), (b1 + 10, b2 + indentDownY),
                       font, fontScale, textColor, textThickness)
        # final resize to the requested output size
        imgTotal = cv.resize(imgTotal, (self.cols, self.rows), interpolation=cv.INTER_AREA)
        return imgTotal
  412. def concatenateImages(self, img1, img2):
  413. # first image will be under the second image
  414. totalRows = img1.shape[0] + img2.shape[0]
  415. totalCols = max(img1.shape[1], img2.shape[1])
  416. # if images columns do not match, the difference is filled in white
  417. imgTotal = np.zeros((totalRows, totalCols, 3), np.uint8)
  418. imgTotal += 255
  419. imgTotal[:img1.shape[0], :img1.shape[1]] = img1
  420. imgTotal[img2.shape[0]:, :img2.shape[1]] = img2
  421. return imgTotal
    def dynamicFile(self, file):
        # dynamic mode for files: decode the audio stream chunk by chunk and
        # redraw the selected graph(s) over a sliding window holding the last
        # frameSizeTime seconds of samples
        cap = cv.VideoCapture(file)
        params = [cv.CAP_PROP_AUDIO_STREAM, self.audioStream,
                  cv.CAP_PROP_VIDEO_STREAM, -1,
                  cv.CAP_PROP_AUDIO_DATA_DEPTH, cv.CV_16S]
        params = np.asarray(params)
        cap.open(file, cv.CAP_ANY, params)
        if cap.isOpened() == False:
            print("ERROR! Can't to open file")
            return
        audioBaseIndex = int(cap.get(cv.CAP_PROP_AUDIO_BASE_INDEX))
        numberOfChannels = int(cap.get(cv.CAP_PROP_AUDIO_TOTAL_CHANNELS))
        samplingRate = int(cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND))
        print("CAP_PROP_AUDIO_DATA_DEPTH: ", str((int(cap.get(cv.CAP_PROP_AUDIO_DATA_DEPTH)))))
        print("CAP_PROP_AUDIO_SAMPLES_PER_SECOND: ", cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND))
        print("CAP_PROP_AUDIO_TOTAL_CHANNELS: ", numberOfChannels)
        print("CAP_PROP_AUDIO_TOTAL_STREAMS: ", cap.get(cv.CAP_PROP_AUDIO_TOTAL_STREAMS))
        step = int(self.updateTime * samplingRate)          # new samples per redraw
        frameSize = int(self.frameSizeTime * samplingRate)  # samples in the sliding window
        # since the dimensional grid is counted in integer seconds,
        # if duration of audio frame is less than xmarkup, to avoid an incorrect display,
        # xmarkup will be taken equal to duration
        if self.frameSizeTime <= self.xmarkup:
            self.xmarkup = self.frameSizeTime
        buffer = []  # samples decoded but not yet shown
        section = np.zeros(frameSize, dtype=np.int16)  # current sliding window
        currentSamples = 0  # total number of samples consumed so far
        while (1):
            if (cap.grab()):
                frame = []
                frame = np.asarray(frame)
                frame = cap.retrieve(frame, audioBaseIndex)
                for i in range(len(frame[1][0])):
                    buffer.append(frame[1][0][i])
                buffer_size = len(buffer)
                if (buffer_size >= step):
                    # shift the sliding window left by `step` samples and
                    # append the newest samples from the buffer
                    section = list(section)
                    currentSamples += step
                    del section[0:step]
                    section.extend(buffer[0:step])
                    del buffer[0:step]
                    section = np.asarray(section)
                    # time-axis bounds of the current window, in seconds
                    if currentSamples < frameSize:
                        xmin = 0
                        xmax = (currentSamples) / samplingRate
                    else:
                        xmin = (currentSamples - frameSize) / samplingRate + 1
                        xmax = (currentSamples) / samplingRate
                    if self.graph == "ampl":
                        imgAmplitude = self.drawAmplitude(section)
                        imgAmplitude = self.drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax)
                        cv.imshow("Display amplitude graph", imgAmplitude)
                        cv.waitKey(self.waitTime)
                    elif self.graph == "spec":
                        stft = self.STFT(section)
                        imgSpec = self.drawSpectrogram(stft)
                        imgSpec = self.drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax)
                        cv.imshow("Display spectrogram", imgSpec)
                        cv.waitKey(self.waitTime)
                    elif self.graph == "ampl_and_spec":
                        imgAmplitude = self.drawAmplitude(section)
                        stft = self.STFT(section)
                        imgSpec = self.drawSpectrogram(stft)
                        imgAmplitude = self.drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax)
                        imgSpec = self.drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax)
                        imgTotal = self.concatenateImages(imgAmplitude, imgSpec)
                        cv.imshow("Display amplitude graph and spectrogram", imgTotal)
                        cv.waitKey(self.waitTime)
            else:
                break
    def dynamicMicrophone(self):
        # dynamic mode for the microphone: record for self.microTime seconds,
        # redrawing the selected graph(s) over a sliding window holding the
        # last frameSizeTime seconds of samples
        cap = cv.VideoCapture()
        params = [cv.CAP_PROP_AUDIO_STREAM, 0, cv.CAP_PROP_VIDEO_STREAM, -1]
        params = np.asarray(params)
        cap.open(0, cv.CAP_ANY, params)
        if cap.isOpened() == False:
            print("ERROR! Can't to open file")
            return
        audioBaseIndex = int(cap.get(cv.CAP_PROP_AUDIO_BASE_INDEX))
        numberOfChannels = int(cap.get(cv.CAP_PROP_AUDIO_TOTAL_CHANNELS))
        print("CAP_PROP_AUDIO_DATA_DEPTH: ", str((int(cap.get(cv.CAP_PROP_AUDIO_DATA_DEPTH)))))
        print("CAP_PROP_AUDIO_SAMPLES_PER_SECOND: ", cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND))
        print("CAP_PROP_AUDIO_TOTAL_CHANNELS: ", numberOfChannels)
        print("CAP_PROP_AUDIO_TOTAL_STREAMS: ", cap.get(cv.CAP_PROP_AUDIO_TOTAL_STREAMS))
        frame = []
        frame = np.asarray(frame)
        samplingRate = int(cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND))
        step = int(self.updateTime * samplingRate)          # new samples per redraw
        frameSize = int(self.frameSizeTime * samplingRate)  # samples in the sliding window
        # the time axis of the sliding window is labelled in whole seconds
        self.xmarkup = self.frameSizeTime
        currentSamples = 0  # total number of samples consumed so far
        buffer = []         # samples recorded but not yet shown
        section = np.zeros(frameSize, dtype=np.int16)  # current sliding window
        cvTickFreq = cv.getTickFrequency()
        sysTimeCurr = cv.getTickCount()
        sysTimePrev = sysTimeCurr
        # override the CLI waitTime: each redraw waits updateTime seconds (ms)
        self.waitTime = self.updateTime * 1000
        while ((sysTimeCurr - sysTimePrev) / cvTickFreq < self.microTime):
            if (cap.grab()):
                frame = []
                frame = np.asarray(frame)
                frame = cap.retrieve(frame, audioBaseIndex)
                for i in range(len(frame[1][0])):
                    buffer.append(frame[1][0][i])
                sysTimeCurr = cv.getTickCount()
                buffer_size = len(buffer)
                if (buffer_size >= step):
                    # shift the sliding window left by `step` samples and
                    # append the newest samples from the buffer
                    section = list(section)
                    currentSamples += step
                    del section[0:step]
                    section.extend(buffer[0:step])
                    del buffer[0:step]
                    section = np.asarray(section)
                    # time-axis bounds of the current window, in seconds
                    if currentSamples < frameSize:
                        xmin = 0
                        xmax = (currentSamples) / samplingRate
                    else:
                        xmin = (currentSamples - frameSize) / samplingRate + 1
                        xmax = (currentSamples) / samplingRate
                    if self.graph == "ampl":
                        imgAmplitude = self.drawAmplitude(section)
                        imgAmplitude = self.drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax)
                        cv.imshow("Display amplitude graph", imgAmplitude)
                        cv.waitKey(self.waitTime)
                    elif self.graph == "spec":
                        stft = self.STFT(section)
                        imgSpec = self.drawSpectrogram(stft)
                        imgSpec = self.drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax)
                        cv.imshow("Display spectrogram", imgSpec)
                        cv.waitKey(self.waitTime)
                    elif self.graph == "ampl_and_spec":
                        imgAmplitude = self.drawAmplitude(section)
                        stft = self.STFT(section)
                        imgSpec = self.drawSpectrogram(stft)
                        imgAmplitude = self.drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax)
                        imgSpec = self.drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax)
                        imgTotal = self.concatenateImages(imgAmplitude, imgSpec)
                        cv.imshow("Display amplitude graph and spectrogram", imgTotal)
                        cv.waitKey(self.waitTime)
            else:
                break
  563. def initAndCheckArgs(self, args):
  564. if args.inputType != "file" and args.inputType != "microphone":
  565. print("Error: ", args.inputType, " input method doesnt exist")
  566. return False
  567. if args.draw != "static" and args.draw != "dynamic":
  568. print("Error: ", args.draw, " draw type doesnt exist")
  569. return False
  570. if args.graph != "ampl" and args.graph != "spec" and args.graph != "ampl_and_spec":
  571. print("Error: ", args.graph, " type of graph doesnt exist")
  572. return False
  573. if args.windowType != "Rect" and args.windowType != "Hann" and args.windowType != "Hamming":
  574. print("Error: ", args.windowType, " type of window doesnt exist")
  575. return False
  576. if args.windLen <= 0:
  577. print("Error: windLen = ", args.windLen, " - incorrect value. Must be > 0")
  578. return False
  579. if args.overlap <= 0:
  580. print("Error: overlap = ", args.overlap, " - incorrect value. Must be > 0")
  581. return False
  582. if args.rows <= 0:
  583. print("Error: rows = ", args.rows, " - incorrect value. Must be > 0")
  584. return False
  585. if args.cols <= 0:
  586. print("Error: cols = ", args.cols, " - incorrect value. Must be > 0")
  587. return False
  588. if args.xmarkup < 2:
  589. print("Error: xmarkup = ", args.xmarkup, " - incorrect value. Must be >= 2")
  590. return False
  591. if args.ymarkup < 2:
  592. print("Error: ymarkup = ", args.ymarkup, " - incorrect value. Must be >= 2")
  593. return False
  594. if args.zmarkup < 2:
  595. print("Error: zmarkup = ", args.zmarkup, " - incorrect value. Must be >= 2")
  596. return False
  597. if args.microTime <= 0:
  598. print("Error: microTime = ", args.microTime, " - incorrect value. Must be > 0")
  599. return False
  600. if args.frameSizeTime <= 0:
  601. print("Error: frameSizeTime = ", args.frameSizeTime, " - incorrect value. Must be > 0")
  602. return False
  603. if args.updateTime <= 0:
  604. print("Error: updateTime = ", args.updateTime, " - incorrect value. Must be > 0")
  605. return False
  606. if args.waitTime < 0:
  607. print("Error: waitTime = ", args.waitTime, " - incorrect value. Must be >= 0")
  608. return False
  609. return True
  610. if __name__ == "__main__":
  611. parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
  612. description='''this sample draws a volume graph and/or spectrogram of audio/video files and microphone\nDefault usage: ./Spectrogram.exe''')
  613. parser.add_argument("-i", "--inputType", dest="inputType", type=str, default="file", help="file or microphone")
  614. parser.add_argument("-d", "--draw", dest="draw", type=str, default="static",
  615. help="type of drawing: static - for plotting graph(s) across the entire input audio; dynamic - for plotting graph(s) in a time-updating window")
  616. parser.add_argument("-g", "--graph", dest="graph", type=str, default="ampl_and_spec",
  617. help="type of graph: amplitude graph or/and spectrogram. Please use tags below : ampl - draw the amplitude graph; spec - draw the spectrogram; ampl_and_spec - draw the amplitude graph and spectrogram on one image under each other")
  618. parser.add_argument("-a", "--audio", dest="audio", type=str, default='Megamind.avi',
  619. help="name and path to file")
  620. parser.add_argument("-s", "--audioStream", dest="audioStream", type=int, default=1,
  621. help=" CAP_PROP_AUDIO_STREAM value")
  622. parser.add_argument("-t", '--windowType', dest="windowType", type=str, default="Rect",
  623. help="type of window for STFT. Please use tags below : Rect/Hann/Hamming")
  624. parser.add_argument("-l", '--windLen', dest="windLen", type=int, default=256, help="size of window for STFT")
  625. parser.add_argument("-o", '--overlap', dest="overlap", type=int, default=128, help="overlap of windows for STFT")
  626. parser.add_argument("-gd", '--grid', dest="enableGrid", type=bool, default=False, help="grid on amplitude graph(on/off)")
  627. parser.add_argument("-r", '--rows', dest="rows", type=int, default=400, help="rows of output image")
  628. parser.add_argument("-c", '--cols', dest="cols", type=int, default=900, help="cols of output image")
  629. parser.add_argument("-x", '--xmarkup', dest="xmarkup", type=int, default=5,
  630. help="number of x axis divisions (time asix)")
  631. parser.add_argument("-y", '--ymarkup', dest="ymarkup", type=int, default=5,
  632. help="number of y axis divisions (frequency or/and amplitude axis)") # ?
  633. parser.add_argument("-z", '--zmarkup', dest="zmarkup", type=int, default=5,
  634. help="number of z axis divisions (colorbar)") # ?
  635. parser.add_argument("-m", '--microTime', dest="microTime", type=int, default=20,
  636. help="time of recording audio with microphone in seconds")
  637. parser.add_argument("-f", '--frameSizeTime', dest="frameSizeTime", type=int, default=5,
  638. help="size of sliding window in seconds")
  639. parser.add_argument("-u", '--updateTime', dest="updateTime", type=int, default=1,
  640. help="update time of sliding window in seconds")
  641. parser.add_argument("-w", '--waitTime', dest="waitTime", type=int, default=10,
  642. help="parameter to cv.waitKey() for dynamic update, takes values in milliseconds")
  643. args = parser.parse_args()
  644. AudioDrawing(args).Draw()