mirror of
https://github.com/sle118/squeezelite-esp32.git
synced 2025-12-09 13:07:03 +03:00
338 lines
11 KiB
C
338 lines
11 KiB
C
/* ***** BEGIN LICENSE BLOCK *****
|
|
* Source last modified: $Id: dct4.c,v 1.1 2005/02/26 01:47:34 jrecker Exp $
|
|
*
|
|
* Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved.
|
|
*
|
|
* The contents of this file, and the files included with this file,
|
|
* are subject to the current version of the RealNetworks Public
|
|
* Source License (the "RPSL") available at
|
|
* http://www.helixcommunity.org/content/rpsl unless you have licensed
|
|
* the file under the current version of the RealNetworks Community
|
|
* Source License (the "RCSL") available at
|
|
* http://www.helixcommunity.org/content/rcsl, in which case the RCSL
|
|
* will apply. You may also obtain the license terms directly from
|
|
* RealNetworks. You may not use this file except in compliance with
|
|
* the RPSL or, if you have a valid RCSL with RealNetworks applicable
|
|
* to this file, the RCSL. Please see the applicable RPSL or RCSL for
|
|
* the rights, obligations and limitations governing use of the
|
|
* contents of the file.
|
|
*
|
|
* This file is part of the Helix DNA Technology. RealNetworks is the
|
|
* developer of the Original Code and owns the copyrights in the
|
|
* portions it created.
|
|
*
|
|
* This file, and the files included with this file, is distributed
|
|
* and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY
|
|
* KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS
|
|
* ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES
|
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET
|
|
* ENJOYMENT OR NON-INFRINGEMENT.
|
|
*
|
|
* Technology Compatibility Kit Test Suite(s) Location:
|
|
* http://www.helixcommunity.org/content/tck
|
|
*
|
|
* Contributor(s):
|
|
*
|
|
* ***** END LICENSE BLOCK ***** */
|
|
|
|
/**************************************************************************************
|
|
* Fixed-point HE-AAC decoder
|
|
* Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com)
|
|
* February 2005
|
|
*
|
|
* dct4.c - optimized DCT-IV
|
|
**************************************************************************************/
|
|
|
|
#include "coder.h"
|
|
#include "assembly.h"
|
|
|
|
/* Transform length in samples, selected by tabidx.
 * Going by the scaling notes on PreMultiply() (1/M table scaling costs 7 or 10 bits),
 *   index 0 is the short block (128 = 2^7) and index 1 the long block (1024 = 2^10).
 */
static const int nmdctTab[NUM_IMDCT_SIZES] PROGMEM = {
	128,
	1024
};

/* Extra step applied to the cos1sin1tab read pointer after each coefficient pair
 *   in the post-twiddle stage: the pointer moves by (1 + postSkip[tabidx]) ints,
 *   i.e. 16 ints per pair for index 0 and 2 ints per pair for index 1.
 */
static const int postSkip[NUM_IMDCT_SIZES] PROGMEM = {
	15,
	1
};
|
|
|
|
/**************************************************************************************
|
|
* Function: PreMultiply
|
|
*
|
|
* Description: pre-twiddle stage of DCT4
|
|
*
|
|
* Inputs: table index (for transform size)
|
|
* buffer of nmdct samples
|
|
*
|
|
* Outputs: processed samples in same buffer
|
|
*
|
|
* Return: none
|
|
*
|
|
* Notes: minimum 1 GB in, 2 GB out, gains 5 (short) or 8 (long) frac bits
|
|
* i.e. gains 2-7= -5 int bits (short) or 2-10 = -8 int bits (long)
|
|
* normalization by -1/N is rolled into tables here (see trigtabs.c)
|
|
* uses 3-mul, 3-add butterflies instead of 4-mul, 2-add
|
|
**************************************************************************************/
|
|
static void PreMultiply(int tabidx, int *zbuf1)
|
|
{
|
|
int i, nmdct, ar1, ai1, ar2, ai2, z1, z2;
|
|
int t, cms2, cps2a, sin2a, cps2b, sin2b;
|
|
int *zbuf2;
|
|
const int *csptr;
|
|
|
|
nmdct = nmdctTab[tabidx];
|
|
zbuf2 = zbuf1 + nmdct - 1;
|
|
csptr = cos4sin4tab + cos4sin4tabOffset[tabidx];
|
|
|
|
/* whole thing should fit in registers - verify that compiler does this */
|
|
for (i = nmdct >> 2; i != 0; i--) {
|
|
/* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */
|
|
cps2a = *csptr++;
|
|
sin2a = *csptr++;
|
|
cps2b = *csptr++;
|
|
sin2b = *csptr++;
|
|
|
|
ar1 = *(zbuf1 + 0);
|
|
ai2 = *(zbuf1 + 1);
|
|
ai1 = *(zbuf2 + 0);
|
|
ar2 = *(zbuf2 - 1);
|
|
|
|
/* gain 2 ints bit from MULSHIFT32 by Q30, but drop 7 or 10 int bits from table scaling of 1/M
|
|
* max per-sample gain (ignoring implicit scaling) = MAX(sin(angle)+cos(angle)) = 1.414
|
|
* i.e. gain 1 GB since worst case is sin(angle) = cos(angle) = 0.707 (Q30), gain 2 from
|
|
* extra sign bits, and eat one in adding
|
|
*/
|
|
t = MULSHIFT32(sin2a, ar1 + ai1);
|
|
z2 = MULSHIFT32(cps2a, ai1) - t;
|
|
cms2 = cps2a - 2*sin2a;
|
|
z1 = MULSHIFT32(cms2, ar1) + t;
|
|
*zbuf1++ = z1; /* cos*ar1 + sin*ai1 */
|
|
*zbuf1++ = z2; /* cos*ai1 - sin*ar1 */
|
|
|
|
t = MULSHIFT32(sin2b, ar2 + ai2);
|
|
z2 = MULSHIFT32(cps2b, ai2) - t;
|
|
cms2 = cps2b - 2*sin2b;
|
|
z1 = MULSHIFT32(cms2, ar2) + t;
|
|
*zbuf2-- = z2; /* cos*ai2 - sin*ar2 */
|
|
*zbuf2-- = z1; /* cos*ar2 + sin*ai2 */
|
|
}
|
|
}
|
|
|
|
/**************************************************************************************
|
|
* Function: PostMultiply
|
|
*
|
|
* Description: post-twiddle stage of DCT4
|
|
*
|
|
* Inputs: table index (for transform size)
|
|
* buffer of nmdct samples
|
|
*
|
|
* Outputs: processed samples in same buffer
|
|
*
|
|
* Return: none
|
|
*
|
|
* Notes: minimum 1 GB in, 2 GB out - gains 2 int bits
|
|
* uses 3-mul, 3-add butterflies instead of 4-mul, 2-add
|
|
**************************************************************************************/
|
|
static void PostMultiply(int tabidx, int *fft1)
|
|
{
|
|
int i, nmdct, ar1, ai1, ar2, ai2, skipFactor;
|
|
int t, cms2, cps2, sin2;
|
|
int *fft2;
|
|
const int *csptr;
|
|
|
|
nmdct = nmdctTab[tabidx];
|
|
csptr = cos1sin1tab;
|
|
skipFactor = postSkip[tabidx];
|
|
fft2 = fft1 + nmdct - 1;
|
|
|
|
/* load coeffs for first pass
|
|
* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin)
|
|
*/
|
|
cps2 = *csptr++;
|
|
sin2 = *csptr;
|
|
csptr += skipFactor;
|
|
cms2 = cps2 - 2*sin2;
|
|
|
|
for (i = nmdct >> 2; i != 0; i--) {
|
|
ar1 = *(fft1 + 0);
|
|
ai1 = *(fft1 + 1);
|
|
ar2 = *(fft2 - 1);
|
|
ai2 = *(fft2 + 0);
|
|
|
|
/* gain 2 ints bit from MULSHIFT32 by Q30
|
|
* max per-sample gain = MAX(sin(angle)+cos(angle)) = 1.414
|
|
* i.e. gain 1 GB since worst case is sin(angle) = cos(angle) = 0.707 (Q30), gain 2 from
|
|
* extra sign bits, and eat one in adding
|
|
*/
|
|
t = MULSHIFT32(sin2, ar1 + ai1);
|
|
*fft2-- = t - MULSHIFT32(cps2, ai1); /* sin*ar1 - cos*ai1 */
|
|
*fft1++ = t + MULSHIFT32(cms2, ar1); /* cos*ar1 + sin*ai1 */
|
|
cps2 = *csptr++;
|
|
sin2 = *csptr;
|
|
csptr += skipFactor;
|
|
|
|
ai2 = -ai2;
|
|
t = MULSHIFT32(sin2, ar2 + ai2);
|
|
*fft2-- = t - MULSHIFT32(cps2, ai2); /* sin*ar1 - cos*ai1 */
|
|
cms2 = cps2 - 2*sin2;
|
|
*fft1++ = t + MULSHIFT32(cms2, ar2); /* cos*ar1 + sin*ai1 */
|
|
}
|
|
}
|
|
|
|
/**************************************************************************************
|
|
* Function: PreMultiplyRescale
|
|
*
|
|
* Description: pre-twiddle stage of DCT4, with rescaling for extra guard bits
|
|
*
|
|
* Inputs: table index (for transform size)
|
|
* buffer of nmdct samples
|
|
* number of guard bits to add to input before processing
|
|
*
|
|
* Outputs: processed samples in same buffer
|
|
*
|
|
* Return: none
|
|
*
|
|
* Notes: see notes on PreMultiply(), above
|
|
**************************************************************************************/
|
|
/* __attribute__ ((section (".data"))) */ static void PreMultiplyRescale(int tabidx, int *zbuf1, int es)
|
|
{
|
|
int i, nmdct, ar1, ai1, ar2, ai2, z1, z2;
|
|
int t, cms2, cps2a, sin2a, cps2b, sin2b;
|
|
int *zbuf2;
|
|
const int *csptr;
|
|
|
|
nmdct = nmdctTab[tabidx];
|
|
zbuf2 = zbuf1 + nmdct - 1;
|
|
csptr = cos4sin4tab + cos4sin4tabOffset[tabidx];
|
|
|
|
/* whole thing should fit in registers - verify that compiler does this */
|
|
for (i = nmdct >> 2; i != 0; i--) {
|
|
/* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */
|
|
cps2a = *csptr++;
|
|
sin2a = *csptr++;
|
|
cps2b = *csptr++;
|
|
sin2b = *csptr++;
|
|
|
|
ar1 = *(zbuf1 + 0) >> es;
|
|
ai1 = *(zbuf2 + 0) >> es;
|
|
ai2 = *(zbuf1 + 1) >> es;
|
|
|
|
t = MULSHIFT32(sin2a, ar1 + ai1);
|
|
z2 = MULSHIFT32(cps2a, ai1) - t;
|
|
cms2 = cps2a - 2*sin2a;
|
|
z1 = MULSHIFT32(cms2, ar1) + t;
|
|
*zbuf1++ = z1;
|
|
*zbuf1++ = z2;
|
|
|
|
ar2 = *(zbuf2 - 1) >> es; /* do here to free up register used for es */
|
|
|
|
t = MULSHIFT32(sin2b, ar2 + ai2);
|
|
z2 = MULSHIFT32(cps2b, ai2) - t;
|
|
cms2 = cps2b - 2*sin2b;
|
|
z1 = MULSHIFT32(cms2, ar2) + t;
|
|
*zbuf2-- = z2;
|
|
*zbuf2-- = z1;
|
|
|
|
}
|
|
}
|
|
|
|
/**************************************************************************************
|
|
* Function: PostMultiplyRescale
|
|
*
|
|
* Description: post-twiddle stage of DCT4, with rescaling for extra guard bits
|
|
*
|
|
* Inputs: table index (for transform size)
|
|
* buffer of nmdct samples
|
|
* number of guard bits to remove from output
|
|
*
|
|
* Outputs: processed samples in same buffer
|
|
*
|
|
* Return: none
|
|
*
|
|
* Notes: clips output to [-2^30, 2^30 - 1], guaranteeing at least 1 guard bit
|
|
* see notes on PostMultiply(), above
|
|
**************************************************************************************/
|
|
/* __attribute__ ((section (".data"))) */ static void PostMultiplyRescale(int tabidx, int *fft1, int es)
|
|
{
|
|
int i, nmdct, ar1, ai1, ar2, ai2, skipFactor, z;
|
|
int t, cs2, sin2;
|
|
int *fft2;
|
|
const int *csptr;
|
|
|
|
nmdct = nmdctTab[tabidx];
|
|
csptr = cos1sin1tab;
|
|
skipFactor = postSkip[tabidx];
|
|
fft2 = fft1 + nmdct - 1;
|
|
|
|
/* load coeffs for first pass
|
|
* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin)
|
|
*/
|
|
cs2 = *csptr++;
|
|
sin2 = *csptr;
|
|
csptr += skipFactor;
|
|
|
|
for (i = nmdct >> 2; i != 0; i--) {
|
|
ar1 = *(fft1 + 0);
|
|
ai1 = *(fft1 + 1);
|
|
ai2 = *(fft2 + 0);
|
|
|
|
t = MULSHIFT32(sin2, ar1 + ai1);
|
|
z = t - MULSHIFT32(cs2, ai1);
|
|
CLIP_2N_SHIFT(z, es);
|
|
*fft2-- = z;
|
|
cs2 -= 2*sin2;
|
|
z = t + MULSHIFT32(cs2, ar1);
|
|
CLIP_2N_SHIFT(z, es);
|
|
*fft1++ = z;
|
|
|
|
cs2 = *csptr++;
|
|
sin2 = *csptr;
|
|
csptr += skipFactor;
|
|
|
|
ar2 = *fft2;
|
|
ai2 = -ai2;
|
|
t = MULSHIFT32(sin2, ar2 + ai2);
|
|
z = t - MULSHIFT32(cs2, ai2);
|
|
CLIP_2N_SHIFT(z, es);
|
|
*fft2-- = z;
|
|
cs2 -= 2*sin2;
|
|
z = t + MULSHIFT32(cs2, ar2);
|
|
CLIP_2N_SHIFT(z, es);
|
|
*fft1++ = z;
|
|
cs2 += 2*sin2;
|
|
}
|
|
}
|
|
|
|
/**************************************************************************************
|
|
* Function: DCT4
|
|
*
|
|
* Description: type-IV DCT
|
|
*
|
|
* Inputs: table index (for transform size)
|
|
* buffer of nmdct samples
|
|
* number of guard bits in the input buffer
|
|
*
|
|
* Outputs: processed samples in same buffer
|
|
*
|
|
* Return: none
|
|
*
|
|
* Notes: operates in-place
|
|
* if number of guard bits in input is < GBITS_IN_DCT4, the input is
|
|
* scaled (>>) before the DCT4 and rescaled (<<, with clipping) after
|
|
* the DCT4 (rare)
|
|
* the output has FBITS_LOST_DCT4 fewer fraction bits than the input
|
|
* the output will always have at least 1 guard bit (GBITS_IN_DCT4 >= 4)
|
|
* int bits gained per stage (PreMul + FFT + PostMul)
|
|
* short blocks = (-5 + 4 + 2) = 1 total
|
|
* long blocks = (-8 + 7 + 2) = 1 total
|
|
**************************************************************************************/
|
|
void DCT4(int tabidx, int *coef, int gb)
|
|
{
|
|
int es;
|
|
|
|
/* fast in-place DCT-IV - adds guard bits if necessary */
|
|
if (gb < GBITS_IN_DCT4) {
|
|
es = GBITS_IN_DCT4 - gb;
|
|
PreMultiplyRescale(tabidx, coef, es);
|
|
R4FFT(tabidx, coef);
|
|
PostMultiplyRescale(tabidx, coef, es);
|
|
} else {
|
|
PreMultiply(tabidx, coef);
|
|
R4FFT(tabidx, coef);
|
|
PostMultiply(tabidx, coef);
|
|
}
|
|
}
|