a8d0c51c69
* DSP: Implement Pipe 2 Pipe 2 is a DSP pipe that is used to initialize both the DSP hardware (the application signals to the DSP to initialize) and the application (the DSP provides the memory location of structures in the shared memory region). * AudioCore: Implement codecs (DecodeADPCM, DecodePCM8, DecodePCM16) * DSP Pipes: Implement as FIFO * AudioCore: File structure * AudioCore: More structure * AudioCore: Buffer management * DSP/Source: Reorganise Source's AdvanceFrame. * Audio Output * lolidk * huh? * interp * More interp stuff * oops * Zero State * Don't mix Source frame if it's not enabled * DSP: Forgot to zero a buffer, adjusted thread synchronisation, adjusted format spec for buffers * asdf * Get it to compile and tweak stretching a bit. * revert stretch test * deleted accidental partial catch submodule commit * new audio stretching algorithm * update .gitmodule * fix OS X build * remove getopt from rubberband * #include <stddef> to audio_core.h * typo * -framework Accelerate * OptionTransientsSmooth -> OptionTransientsCrisp * tweak stretch tempo smoothing coefficient. also switch back to smooth. * tweak mroe * remove printf * sola * #include <cmath> * VERY QUICK MERGE TO GET IT WORKING DOESN'T ACTIVATE AUDIO FILTERS * Reminder to self * fix comparison * common/thread: Correct code style * Thread: Make Barrier reusable * fix threading synchonisation code * add profiling code * print error to console when audio clips * fix metallic sound * reduce logspam
395 lines
13 KiB
C++
395 lines
13 KiB
C++
////////////////////////////////////////////////////////////////////////////////
|
|
///
|
|
/// MMX optimized routines. All MMX optimized functions have been gathered into
|
|
/// this single source code file, regardless to their class or original source
|
|
/// code file, in order to ease porting the library to other compiler and
|
|
/// processor platforms.
|
|
///
|
|
/// The MMX-optimizations are programmed using MMX compiler intrinsics that
|
|
/// are supported both by Microsoft Visual C++ and GCC compilers, so this file
|
|
/// should compile with both toolsets.
|
|
///
|
|
/// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++
|
|
/// 6.0 processor pack" update to support compiler intrinsic syntax. The update
|
|
/// is available for download at Microsoft Developers Network, see here:
|
|
/// http://msdn.microsoft.com/en-us/vstudio/aa718349.aspx
|
|
///
|
|
/// Author : Copyright (c) Olli Parviainen
|
|
/// Author e-mail : oparviai 'at' iki.fi
|
|
/// SoundTouch WWW: http://www.surina.net/soundtouch
|
|
///
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
//
|
|
// Last changed : $Date: 2015-08-09 00:00:15 +0300 (Sun, 09 Aug 2015) $
|
|
// File revision : $Revision: 4 $
|
|
//
|
|
// $Id: mmx_optimized.cpp 226 2015-08-08 21:00:15Z oparviai $
|
|
//
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
//
|
|
// License :
|
|
//
|
|
// SoundTouch audio processing library
|
|
// Copyright (c) Olli Parviainen
|
|
//
|
|
// This library is free software; you can redistribute it and/or
|
|
// modify it under the terms of the GNU Lesser General Public
|
|
// License as published by the Free Software Foundation; either
|
|
// version 2.1 of the License, or (at your option) any later version.
|
|
//
|
|
// This library is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
// Lesser General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU Lesser General Public
|
|
// License along with this library; if not, write to the Free Software
|
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
//
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
#include "STTypes.h"
|
|
|
|
#ifdef SOUNDTOUCH_ALLOW_MMX
|
|
// MMX routines available only with integer sample type
|
|
|
|
using namespace soundtouch;
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
//
|
|
// implementation of MMX optimized functions of class 'TDStretchMMX'
|
|
//
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
#include "TDStretch.h"
|
|
#include <mmintrin.h>
|
|
#include <limits.h>
|
|
#include <math.h>
|
|
|
|
|
|
// Calculates cross correlation of two buffers
|
|
double TDStretchMMX::calcCrossCorr(const short *pV1, const short *pV2, double &dnorm)
|
|
{
|
|
const __m64 *pVec1, *pVec2;
|
|
__m64 shifter;
|
|
__m64 accu, normaccu;
|
|
long corr, norm;
|
|
int i;
|
|
|
|
pVec1 = (__m64*)pV1;
|
|
pVec2 = (__m64*)pV2;
|
|
|
|
shifter = _m_from_int(overlapDividerBitsNorm);
|
|
normaccu = accu = _mm_setzero_si64();
|
|
|
|
// Process 4 parallel sets of 2 * stereo samples or 4 * mono samples
|
|
// during each round for improved CPU-level parallellization.
|
|
for (i = 0; i < channels * overlapLength / 16; i ++)
|
|
{
|
|
__m64 temp, temp2;
|
|
|
|
// dictionary of instructions:
|
|
// _m_pmaddwd : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3]
|
|
// _mm_add_pi32 : 2*32bit add
|
|
// _m_psrad : 32bit right-shift
|
|
|
|
temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[0], pVec2[0]), shifter),
|
|
_mm_sra_pi32(_mm_madd_pi16(pVec1[1], pVec2[1]), shifter));
|
|
temp2 = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[0], pVec1[0]), shifter),
|
|
_mm_sra_pi32(_mm_madd_pi16(pVec1[1], pVec1[1]), shifter));
|
|
accu = _mm_add_pi32(accu, temp);
|
|
normaccu = _mm_add_pi32(normaccu, temp2);
|
|
|
|
temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[2], pVec2[2]), shifter),
|
|
_mm_sra_pi32(_mm_madd_pi16(pVec1[3], pVec2[3]), shifter));
|
|
temp2 = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[2], pVec1[2]), shifter),
|
|
_mm_sra_pi32(_mm_madd_pi16(pVec1[3], pVec1[3]), shifter));
|
|
accu = _mm_add_pi32(accu, temp);
|
|
normaccu = _mm_add_pi32(normaccu, temp2);
|
|
|
|
pVec1 += 4;
|
|
pVec2 += 4;
|
|
}
|
|
|
|
// copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1
|
|
// and finally store the result into the variable "corr"
|
|
|
|
accu = _mm_add_pi32(accu, _mm_srli_si64(accu, 32));
|
|
corr = _m_to_int(accu);
|
|
|
|
normaccu = _mm_add_pi32(normaccu, _mm_srli_si64(normaccu, 32));
|
|
norm = _m_to_int(normaccu);
|
|
|
|
// Clear MMS state
|
|
_m_empty();
|
|
|
|
if (norm > (long)maxnorm)
|
|
{
|
|
maxnorm = norm;
|
|
}
|
|
|
|
// Normalize result by dividing by sqrt(norm) - this step is easiest
|
|
// done using floating point operation
|
|
dnorm = (double)norm;
|
|
|
|
return (double)corr / sqrt(dnorm < 1e-9 ? 1.0 : dnorm);
|
|
// Note: Warning about the missing EMMS instruction is harmless
|
|
// as it'll be called elsewhere.
|
|
}
|
|
|
|
|
|
/// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
|
|
double TDStretchMMX::calcCrossCorrAccumulate(const short *pV1, const short *pV2, double &dnorm)
|
|
{
|
|
const __m64 *pVec1, *pVec2;
|
|
__m64 shifter;
|
|
__m64 accu;
|
|
long corr, lnorm;
|
|
int i;
|
|
|
|
// cancel first normalizer tap from previous round
|
|
lnorm = 0;
|
|
for (i = 1; i <= channels; i ++)
|
|
{
|
|
lnorm -= (pV1[-i] * pV1[-i]) >> overlapDividerBitsNorm;
|
|
}
|
|
|
|
pVec1 = (__m64*)pV1;
|
|
pVec2 = (__m64*)pV2;
|
|
|
|
shifter = _m_from_int(overlapDividerBitsNorm);
|
|
accu = _mm_setzero_si64();
|
|
|
|
// Process 4 parallel sets of 2 * stereo samples or 4 * mono samples
|
|
// during each round for improved CPU-level parallellization.
|
|
for (i = 0; i < channels * overlapLength / 16; i ++)
|
|
{
|
|
__m64 temp;
|
|
|
|
// dictionary of instructions:
|
|
// _m_pmaddwd : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3]
|
|
// _mm_add_pi32 : 2*32bit add
|
|
// _m_psrad : 32bit right-shift
|
|
|
|
temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[0], pVec2[0]), shifter),
|
|
_mm_sra_pi32(_mm_madd_pi16(pVec1[1], pVec2[1]), shifter));
|
|
accu = _mm_add_pi32(accu, temp);
|
|
|
|
temp = _mm_add_pi32(_mm_sra_pi32(_mm_madd_pi16(pVec1[2], pVec2[2]), shifter),
|
|
_mm_sra_pi32(_mm_madd_pi16(pVec1[3], pVec2[3]), shifter));
|
|
accu = _mm_add_pi32(accu, temp);
|
|
|
|
pVec1 += 4;
|
|
pVec2 += 4;
|
|
}
|
|
|
|
// copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1
|
|
// and finally store the result into the variable "corr"
|
|
|
|
accu = _mm_add_pi32(accu, _mm_srli_si64(accu, 32));
|
|
corr = _m_to_int(accu);
|
|
|
|
// Clear MMS state
|
|
_m_empty();
|
|
|
|
// update normalizer with last samples of this round
|
|
pV1 = (short *)pVec1;
|
|
for (int j = 1; j <= channels; j ++)
|
|
{
|
|
lnorm += (pV1[-j] * pV1[-j]) >> overlapDividerBitsNorm;
|
|
}
|
|
dnorm += (double)lnorm;
|
|
|
|
if (lnorm > (long)maxnorm)
|
|
{
|
|
maxnorm = lnorm;
|
|
}
|
|
|
|
// Normalize result by dividing by sqrt(norm) - this step is easiest
|
|
// done using floating point operation
|
|
return (double)corr / sqrt((dnorm < 1e-9) ? 1.0 : dnorm);
|
|
}
|
|
|
|
|
|
void TDStretchMMX::clearCrossCorrState()
|
|
{
|
|
// Clear MMS state
|
|
_m_empty();
|
|
//_asm EMMS;
|
|
}
|
|
|
|
|
|
|
|
// MMX-optimized version of the function overlapStereo
|
|
void TDStretchMMX::overlapStereo(short *output, const short *input) const
|
|
{
|
|
const __m64 *pVinput, *pVMidBuf;
|
|
__m64 *pVdest;
|
|
__m64 mix1, mix2, adder, shifter;
|
|
int i;
|
|
|
|
pVinput = (const __m64*)input;
|
|
pVMidBuf = (const __m64*)pMidBuffer;
|
|
pVdest = (__m64*)output;
|
|
|
|
// mix1 = mixer values for 1st stereo sample
|
|
// mix1 = mixer values for 2nd stereo sample
|
|
// adder = adder for updating mixer values after each round
|
|
|
|
mix1 = _mm_set_pi16(0, overlapLength, 0, overlapLength);
|
|
adder = _mm_set_pi16(1, -1, 1, -1);
|
|
mix2 = _mm_add_pi16(mix1, adder);
|
|
adder = _mm_add_pi16(adder, adder);
|
|
|
|
// Overlaplength-division by shifter. "+1" is to account for "-1" deduced in
|
|
// overlapDividerBits calculation earlier.
|
|
shifter = _m_from_int(overlapDividerBitsPure + 1);
|
|
|
|
for (i = 0; i < overlapLength / 4; i ++)
|
|
{
|
|
__m64 temp1, temp2;
|
|
|
|
// load & shuffle data so that input & mixbuffer data samples are paired
|
|
temp1 = _mm_unpacklo_pi16(pVMidBuf[0], pVinput[0]); // = i0l m0l i0r m0r
|
|
temp2 = _mm_unpackhi_pi16(pVMidBuf[0], pVinput[0]); // = i1l m1l i1r m1r
|
|
|
|
// temp = (temp .* mix) >> shifter
|
|
temp1 = _mm_sra_pi32(_mm_madd_pi16(temp1, mix1), shifter);
|
|
temp2 = _mm_sra_pi32(_mm_madd_pi16(temp2, mix2), shifter);
|
|
pVdest[0] = _mm_packs_pi32(temp1, temp2); // pack 2*2*32bit => 4*16bit
|
|
|
|
// update mix += adder
|
|
mix1 = _mm_add_pi16(mix1, adder);
|
|
mix2 = _mm_add_pi16(mix2, adder);
|
|
|
|
// --- second round begins here ---
|
|
|
|
// load & shuffle data so that input & mixbuffer data samples are paired
|
|
temp1 = _mm_unpacklo_pi16(pVMidBuf[1], pVinput[1]); // = i2l m2l i2r m2r
|
|
temp2 = _mm_unpackhi_pi16(pVMidBuf[1], pVinput[1]); // = i3l m3l i3r m3r
|
|
|
|
// temp = (temp .* mix) >> shifter
|
|
temp1 = _mm_sra_pi32(_mm_madd_pi16(temp1, mix1), shifter);
|
|
temp2 = _mm_sra_pi32(_mm_madd_pi16(temp2, mix2), shifter);
|
|
pVdest[1] = _mm_packs_pi32(temp1, temp2); // pack 2*2*32bit => 4*16bit
|
|
|
|
// update mix += adder
|
|
mix1 = _mm_add_pi16(mix1, adder);
|
|
mix2 = _mm_add_pi16(mix2, adder);
|
|
|
|
pVinput += 2;
|
|
pVMidBuf += 2;
|
|
pVdest += 2;
|
|
}
|
|
|
|
_m_empty(); // clear MMS state
|
|
}
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
//
|
|
// implementation of MMX optimized functions of class 'FIRFilter'
|
|
//
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
#include "FIRFilter.h"
|
|
|
|
|
|
FIRFilterMMX::FIRFilterMMX() : FIRFilter()
|
|
{
|
|
filterCoeffsAlign = NULL;
|
|
filterCoeffsUnalign = NULL;
|
|
}
|
|
|
|
|
|
FIRFilterMMX::~FIRFilterMMX()
|
|
{
|
|
delete[] filterCoeffsUnalign;
|
|
}
|
|
|
|
|
|
// (overloaded) Calculates filter coefficients for MMX routine
|
|
void FIRFilterMMX::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor)
|
|
{
|
|
uint i;
|
|
FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
|
|
|
|
// Ensure that filter coeffs array is aligned to 16-byte boundary
|
|
delete[] filterCoeffsUnalign;
|
|
filterCoeffsUnalign = new short[2 * newLength + 8];
|
|
filterCoeffsAlign = (short *)SOUNDTOUCH_ALIGN_POINTER_16(filterCoeffsUnalign);
|
|
|
|
// rearrange the filter coefficients for mmx routines
|
|
for (i = 0;i < length; i += 4)
|
|
{
|
|
filterCoeffsAlign[2 * i + 0] = coeffs[i + 0];
|
|
filterCoeffsAlign[2 * i + 1] = coeffs[i + 2];
|
|
filterCoeffsAlign[2 * i + 2] = coeffs[i + 0];
|
|
filterCoeffsAlign[2 * i + 3] = coeffs[i + 2];
|
|
|
|
filterCoeffsAlign[2 * i + 4] = coeffs[i + 1];
|
|
filterCoeffsAlign[2 * i + 5] = coeffs[i + 3];
|
|
filterCoeffsAlign[2 * i + 6] = coeffs[i + 1];
|
|
filterCoeffsAlign[2 * i + 7] = coeffs[i + 3];
|
|
}
|
|
}
|
|
|
|
|
|
|
|
// mmx-optimized version of the filter routine for stereo sound
|
|
uint FIRFilterMMX::evaluateFilterStereo(short *dest, const short *src, uint numSamples) const
|
|
{
|
|
// Create stack copies of the needed member variables for asm routines :
|
|
uint i, j;
|
|
__m64 *pVdest = (__m64*)dest;
|
|
|
|
if (length < 2) return 0;
|
|
|
|
for (i = 0; i < (numSamples - length) / 2; i ++)
|
|
{
|
|
__m64 accu1;
|
|
__m64 accu2;
|
|
const __m64 *pVsrc = (const __m64*)src;
|
|
const __m64 *pVfilter = (const __m64*)filterCoeffsAlign;
|
|
|
|
accu1 = accu2 = _mm_setzero_si64();
|
|
for (j = 0; j < lengthDiv8 * 2; j ++)
|
|
{
|
|
__m64 temp1, temp2;
|
|
|
|
temp1 = _mm_unpacklo_pi16(pVsrc[0], pVsrc[1]); // = l2 l0 r2 r0
|
|
temp2 = _mm_unpackhi_pi16(pVsrc[0], pVsrc[1]); // = l3 l1 r3 r1
|
|
|
|
accu1 = _mm_add_pi32(accu1, _mm_madd_pi16(temp1, pVfilter[0])); // += l2*f2+l0*f0 r2*f2+r0*f0
|
|
accu1 = _mm_add_pi32(accu1, _mm_madd_pi16(temp2, pVfilter[1])); // += l3*f3+l1*f1 r3*f3+r1*f1
|
|
|
|
temp1 = _mm_unpacklo_pi16(pVsrc[1], pVsrc[2]); // = l4 l2 r4 r2
|
|
|
|
accu2 = _mm_add_pi32(accu2, _mm_madd_pi16(temp2, pVfilter[0])); // += l3*f2+l1*f0 r3*f2+r1*f0
|
|
accu2 = _mm_add_pi32(accu2, _mm_madd_pi16(temp1, pVfilter[1])); // += l4*f3+l2*f1 r4*f3+r2*f1
|
|
|
|
// accu1 += l2*f2+l0*f0 r2*f2+r0*f0
|
|
// += l3*f3+l1*f1 r3*f3+r1*f1
|
|
|
|
// accu2 += l3*f2+l1*f0 r3*f2+r1*f0
|
|
// l4*f3+l2*f1 r4*f3+r2*f1
|
|
|
|
pVfilter += 2;
|
|
pVsrc += 2;
|
|
}
|
|
// accu >>= resultDivFactor
|
|
accu1 = _mm_srai_pi32(accu1, resultDivFactor);
|
|
accu2 = _mm_srai_pi32(accu2, resultDivFactor);
|
|
|
|
// pack 2*2*32bits => 4*16 bits
|
|
pVdest[0] = _mm_packs_pi32(accu1, accu2);
|
|
src += 4;
|
|
pVdest ++;
|
|
}
|
|
|
|
_m_empty(); // clear emms state
|
|
|
|
return (numSamples & 0xfffffffe) - length;
|
|
}
|
|
|
|
#endif // SOUNDTOUCH_ALLOW_MMX
|