{-| module : Data.Number.Flint.FFT.FFI copyright : (c) 2022 Hartmut Monien license : GNU GPL, version 2 or above (see LICENSE) maintainer : hmonien@uni-bonn.de -} module Data.Number.Flint.FFT.FFI ( -- * Schoenhage-Strassen FFT -- * Split\/combine FFT coefficients fft_split_limbs , fft_split_bits , fft_combine_limbs , fft_combine_bits -- * Test helper functions , fermat_to_mpz -- * Arithmetic modulo a generalised Fermat number , mpn_negmod_2expp1 , mpn_addmod_2expp1_1 , mpn_normmod_2expp1 , mpn_mul_2expmod_2expp1 , mpn_div_2expmod_2expp1 -- * Generic butterflies , fft_adjust , fft_adjust_sqrt2 , butterfly_lshB , butterfly_rshB -- * Radix 2 transforms , fft_butterfly , ifft_butterfly , fft_radix2 , fft_truncate , fft_truncate1 , ifft_radix2 , ifft_truncate , ifft_truncate1 , fft_butterfly_sqrt2 , ifft_butterfly_sqrt2 , fft_truncate_sqrt2 , ifft_truncate_sqrt2 -- * Matrix Fourier Transforms , fft_butterfly_twiddle , ifft_butterfly_twiddle , fft_radix2_twiddle , ifft_radix2_twiddle , fft_truncate1_twiddle , ifft_truncate1_twiddle , fft_mfa_truncate_sqrt2 , ifft_mfa_truncate_sqrt2 , fft_mfa_truncate_sqrt2_outer , fft_mfa_truncate_sqrt2_inner , ifft_mfa_truncate_sqrt2_outer -- * Negacyclic multiplication , fft_negacyclic , ifft_negacyclic , fft_naive_convolution_1 , _fft_mulmod_2expp1 , fft_adjust_limbs , fft_mulmod_2expp1 -- * Integer multiplication , mul_truncate_sqrt2 , mul_mfa_truncate_sqrt2 , flint_mpn_mul_fft_main -- * Convolution , fft_convolution -- * FFT Precaching , fft_precache , fft_convolution_precache ) where -- Schoenhage-Strassen FFT ----------------------------------------------------- import Foreign.C.String import Foreign.C.Types import Foreign.ForeignPtr import Foreign.Ptr ( Ptr, FunPtr, plusPtr ) import Foreign.Storable import Foreign.Marshal ( free ) import Data.Number.Flint.Flint #include #include -- Split\/combine FFT coefficients --------------------------------------------- -- | /fft_split_limbs/ /poly/ /limbs/ /total_limbs/ /coeff_limbs/ /output_limbs/ -- -- Split an integer @(limbs, total_limbs)@ into coefficients of length -- @coeff_limbs@ limbs and store as the coefficients of @poly@ which are -- assumed to have space for @output_limbs + 1@ limbs per coefficient. The -- coefficients of the polynomial do not need to be zeroed before calling -- this function, however the number of coefficients written is returned by -- the function and any coefficients beyond this point are not touched. foreign import ccall "fft.h fft_split_limbs" fft_split_limbs :: Ptr (Ptr CMpLimb) -> Ptr CMp -> CMpSize -> CMpSize -> CMpSize -> IO CMpSize -- | /fft_split_bits/ /poly/ /limbs/ /total_limbs/ /bits/ /output_limbs/ -- -- Split an integer @(limbs, total_limbs)@ into coefficients of the given -- number of @bits@ and store as the coefficients of @poly@ which are -- assumed to have space for @output_limbs + 1@ limbs per coefficient. The -- coefficients of the polynomial do not need to be zeroed before calling -- this function, however the number of coefficients written is returned by -- the function and any coefficients beyond this point are not touched. foreign import ccall "fft.h fft_split_bits" fft_split_bits :: Ptr (Ptr CMpLimb) -> Ptr CMp -> CMpSize -> CFBitCnt -> CMpSize -> IO CMpSize -- | /fft_combine_limbs/ /res/ /poly/ /length/ /coeff_limbs/ /output_limbs/ /total_limbs/ -- -- Evaluate the polynomial @poly@ of the given @length@ at @B^coeff_limbs@, -- where @B = 2^FLINT_BITS@, and add the result to the integer -- @(res, total_limbs)@ throwing away any bits that exceed the given number -- of limbs. The polynomial coefficients are assumed to have at least -- @output_limbs@ limbs each, however any additional limbs are ignored. -- -- If the integer is initially zero the result will just be the evaluation -- of the polynomial. foreign import ccall "fft.h fft_combine_limbs" fft_combine_limbs :: Ptr CMpLimb -> Ptr (Ptr CMpLimb) -> CLong -> CMpSize -> CMpSize -> CMpSize -> IO () -- | /fft_combine_bits/ /res/ /poly/ /length/ /bits/ /output_limbs/ /total_limbs/ -- -- Evaluate the polynomial @poly@ of the given @length@ at @2^bits@ and add -- the result to the integer @(res, total_limbs)@ throwing away any bits -- that exceed the given number of limbs. The polynomial coefficients are -- assumed to have at least @output_limbs@ limbs each, however any -- additional limbs are ignored. If the integer is initially zero the -- result will just be the evaluation of the polynomial. foreign import ccall "fft.h fft_combine_bits" fft_combine_bits :: Ptr CMpLimb -> Ptr (Ptr CMpLimb) -> CLong -> CFBitCnt -> CMpSize -> CMpSize -> IO () -- Test helper functions ------------------------------------------------------- -- | /fermat_to_mpz/ /m/ /i/ /limbs/ -- -- Convert the Fermat number @(i, limbs)@ modulo @B^limbs + 1@ to an -- @mpz_t m@. Assumes @m@ has been initialised. This function is used only -- in test code. foreign import ccall "fft.h fermat_to_mpz" fermat_to_mpz :: Ptr CMpz -> Ptr CMpLimb -> CMpSize -> IO () -- Arithmetic modulo a generalised Fermat number ------------------------------- -- | /mpn_negmod_2expp1/ /z/ /a/ /limbs/ -- -- Set @z@ to the negation of the Fermat number \(a\) modulo @B^limbs + 1@. -- The input @a@ is expected to be fully reduced, and the output is fully -- reduced. Aliasing is permitted. foreign import ccall "fft.h mpn_negmod_2expp1" mpn_negmod_2expp1 :: Ptr CMpLimb -> Ptr CMpLimb -> CMpSize -> IO () -- | /mpn_addmod_2expp1_1/ /r/ /limbs/ /c/ -- -- Adds the signed limb @c@ to the generalised Fermat number @r@ modulo -- @B^limbs + 1@. The compiler should be able to inline this for the case -- that there is no overflow from the first limb. foreign import ccall "fft.h mpn_addmod_2expp1_1" mpn_addmod_2expp1_1 :: Ptr CMpLimb -> CMpSize -> Ptr CMpSLimb-> IO () -- | /mpn_normmod_2expp1/ /t/ /limbs/ -- -- Given @t@ a signed integer of @limbs + 1@ limbs in two\'s complement -- format, reduce @t@ to the corresponding value modulo the generalised -- Fermat number @B^limbs + 1@, where @B = 2^FLINT_BITS@. foreign import ccall "fft.h mpn_normmod_2expp1" mpn_normmod_2expp1 :: Ptr CMpLimb -> CMpSize -> IO () -- | /mpn_mul_2expmod_2expp1/ /t/ /i1/ /limbs/ /d/ -- -- Given @i1@ a signed integer of @limbs + 1@ limbs in two\'s complement -- format reduced modulo @B^limbs + 1@ up to some overflow, compute -- @t = i1*2^d@ modulo \(p\). The result will not necessarily be fully -- reduced. The number of bits @d@ must be nonnegative and less than -- @FLINT_BITS@. Aliasing is permitted. foreign import ccall "fft.h mpn_mul_2expmod_2expp1" mpn_mul_2expmod_2expp1 :: Ptr CMpLimb -> Ptr CMpLimb -> CMpSize -> CFBitCnt -> IO () -- | /mpn_div_2expmod_2expp1/ /t/ /i1/ /limbs/ /d/ -- -- Given @i1@ a signed integer of @limbs + 1@ limbs in two\'s complement -- format reduced modulo @B^limbs + 1@ up to some overflow, compute -- @t = i1\/2^d@ modulo \(p\). The result will not necessarily be fully -- reduced. The number of bits @d@ must be nonnegative and less than -- @FLINT_BITS@. Aliasing is permitted. foreign import ccall "fft.h mpn_div_2expmod_2expp1" mpn_div_2expmod_2expp1 :: Ptr CMpLimb -> Ptr CMpLimb -> CMpSize -> CFBitCnt -> IO () -- Generic butterflies --------------------------------------------------------- -- | /fft_adjust/ /r/ /i1/ /i/ /limbs/ /w/ -- -- Set @r@ to @i1@ times \(z^i\) modulo @B^limbs + 1@ where \(z\) -- corresponds to multiplication by \(2^w\). This can be thought of as part -- of a butterfly operation. We require \(0 \leq i < n\) where \(nw =\) -- @limbs*FLINT_BITS@. Aliasing is not supported. foreign import ccall "fft.h fft_adjust" fft_adjust :: Ptr CMpLimb -> Ptr CMpLimb -> CMpSize -> CMpSize -> CFBitCnt -> IO () -- | /fft_adjust_sqrt2/ /r/ /i1/ /i/ /limbs/ /w/ /temp/ -- -- Set @r@ to @i1@ times \(z^i\) modulo @B^limbs + 1@ where \(z\) -- corresponds to multiplication by \(\sqrt{2}^w\). This can be thought of -- as part of a butterfly operation. We require \(0 \leq i < 2\cdot n\) and -- odd where \(nw =\) @limbs*FLINT_BITS@. foreign import ccall "fft.h fft_adjust_sqrt2" fft_adjust_sqrt2 :: Ptr CMpLimb -> Ptr CMpLimb -> CMpSize -> CMpSize -> CFBitCnt -> Ptr CMpLimb -> IO () -- | /butterfly_lshB/ /t/ /u/ /i1/ /i2/ /limbs/ /x/ /y/ -- -- We are given two integers @i1@ and @i2@ modulo @B^limbs + 1@ which are -- not necessarily normalised. We compute @t = (i1 + i2)*B^x@ and -- @u = (i1 - i2)*B^y@ modulo \(p\). Aliasing between inputs and outputs is -- not permitted. We require @x@ and @y@ to be less than @limbs@ and -- nonnegative. foreign import ccall "fft.h butterfly_lshB" butterfly_lshB :: Ptr CMpLimb -> Ptr CMpLimb -> Ptr CMpLimb -> Ptr CMpLimb -> CMpSize -> CMpSize -> CMpSize -> IO () -- | /butterfly_rshB/ /t/ /u/ /i1/ /i2/ /limbs/ /x/ /y/ -- -- We are given two integers @i1@ and @i2@ modulo @B^limbs + 1@ which are -- not necessarily normalised. We compute @t = (i1 + i2)\/B^x@ and -- @u = (i1 - i2)\/B^y@ modulo \(p\). Aliasing between inputs and outputs -- is not permitted. We require @x@ and @y@ to be less than @limbs@ and -- nonnegative. foreign import ccall "fft.h butterfly_rshB" butterfly_rshB :: Ptr CMpLimb -> Ptr CMpLimb -> Ptr CMpLimb -> Ptr CMpLimb -> CMpSize -> CMpSize -> CMpSize -> IO () -- Radix 2 transforms ---------------------------------------------------------- -- | /fft_butterfly/ /s/ /t/ /i1/ /i2/ /i/ /limbs/ /w/ -- -- Set @s = i1 + i2@, @t = z1^i*(i1 - i2)@ modulo @B^limbs + 1@ where -- @z1 = exp(Pi*I\/n)@ corresponds to multiplication by \(2^w\). Requires -- \(0 \leq i < n\) where \(nw =\) @limbs*FLINT_BITS@. foreign import ccall "fft.h fft_butterfly" fft_butterfly :: Ptr CMpLimb -> Ptr CMpLimb -> Ptr CMpLimb -> Ptr CMpLimb -> CMpSize -> CMpSize -> CFBitCnt -> IO () -- | /ifft_butterfly/ /s/ /t/ /i1/ /i2/ /i/ /limbs/ /w/ -- -- Set @s = i1 + z1^i*i2@, @t = i1 - z1^i*i2@ modulo @B^limbs + 1@ where -- @z1 = exp(-Pi*I\/n)@ corresponds to division by \(2^w\). Requires -- \(0 \leq i < 2n\) where \(nw =\) @limbs*FLINT_BITS@. foreign import ccall "fft.h ifft_butterfly" ifft_butterfly :: Ptr CMpLimb -> Ptr CMpLimb -> Ptr CMpLimb -> Ptr CMpLimb -> CMpSize -> CMpSize -> CFBitCnt -> IO () -- | /fft_radix2/ /ii/ /n/ /w/ /t1/ /t2/ -- -- The radix 2 DIF FFT works as follows: -- -- Input: @[i0, i1, ..., i(m-1)]@, for \(m = 2n\) a power of \(2\). -- -- Output: @[r0, r1, ..., r(m-1)]@ @= FFT[i0, i1, ..., i(m-1)]@. -- -- Algorithm: -- -- \(\bullet\) Recursively compute @[r0, r2, r4, ...., r(m-2)]@ --     @= FFT[i0+i(m\/2), i1+i(m\/2+1), ..., i(m\/2-1)+i(m-1)]@ -- \(\bullet\) Let @[t0, t1, ..., t(m\/2-1)]@ --     @= [i0-i(m\/2), i1-i(m\/2+1), ..., i(m\/2-1)-i(m-1)]@ -- \(\bullet\) Let @[u0, u1, ..., u(m\/2-1)]@ --     @= [z1^0*t0, z1^1*t1, ..., z1^(m\/2-1)*t(m\/2-1)]@ --     where @z1 = exp(2*Pi*I\/m)@ corresponds to multiplication by -- \(2^w\). -- \(\bullet\) Recursively compute @[r1, r3, ..., r(m-1)]@ --     @= FFT[u0, u1, ..., u(m\/2-1)]@ -- -- The parameters are as follows: -- -- \(\bullet\) @2*n@ is the length of the input and output arrays -- -- [\(\bullet\) \(w\) is such that \(2^w\) is an \(2n\)-th root of unity in the ring \(\mathbf{Z}/p\mathbf{Z}\) that we are working in, i.e. \(p = 2^{wn} + 1\) (here \(n\) is divisible by] -- @GMP_LIMB_BITS@) -- -- [\(\bullet\) @ii@ is the array of inputs (each input is an] -- array of limbs of length @wn\/GMP_LIMB_BITS + 1@ (the extra limbs -- being a \"carry limb\"). Outputs are written in-place. -- -- We require \(nw\) to be at least 64 and the two temporary space pointers -- to point to blocks of size @n*w + FLINT_BITS@ bits. foreign import ccall "fft.h fft_radix2" fft_radix2 :: Ptr (Ptr CMpLimb) -> CMpSize -> CFBitCnt -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> IO () -- | /fft_truncate/ /ii/ /n/ /w/ /t1/ /t2/ /trunc/ -- -- As for @fft_radix2@ except that only the first @trunc@ coefficients of -- the output are computed and the input is regarded as having (implied) -- zero coefficients from coefficient @trunc@ onwards. The coefficients -- must exist as the algorithm needs to use this extra space, but their -- value is irrelevant. The value of @trunc@ must be divisible by 2. foreign import ccall "fft.h fft_truncate" fft_truncate :: Ptr (Ptr CMpLimb) -> CMpSize -> CFBitCnt -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> CMpSize -> IO () -- | /fft_truncate1/ /ii/ /n/ /w/ /t1/ /t2/ /trunc/ -- -- As for @fft_radix2@ except that only the first @trunc@ coefficients of -- the output are computed. The transform still needs all \(2n\) input -- coefficients to be specified. foreign import ccall "fft.h fft_truncate1" fft_truncate1 :: Ptr (Ptr CMpLimb) -> CMpSize -> CFBitCnt -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> CMpSize -> IO () -- | /ifft_radix2/ /ii/ /n/ /w/ /t1/ /t2/ -- -- The radix 2 DIF IFFT works as follows: -- -- Input: @[i0, i1, ..., i(m-1)]@, for \(m = 2n\) a power of \(2\). -- -- [Output: @[r0, r1, ..., r(m-1)]@] -- @= IFFT[i0, i1, ..., i(m-1)]@. -- -- Algorithm: -- -- [\(\bullet\) Recursively compute @[s0, s1, ...., s(m\/2-1)]@] -- @= IFFT[i0, i2, ..., i(m-2)]@ -- -- [\(\bullet\) Recursively compute @[t(m\/2), t(m\/2+1), ..., t(m-1)]@] -- @= IFFT[i1, i3, ..., i(m-1)]@ -- -- [\(\bullet\) Let @[r0, r1, ..., r(m\/2-1)]@] -- @= [s0+z1^0*t0, s1+z1^1*t1, ..., s(m\/2-1)+z1^(m\/2-1)*t(m\/2-1)]@ -- where @z1 = exp(-2*Pi*I\/m)@ corresponds to division by \(2^w\). -- -- [\(\bullet\) Let @[r(m\/2), r(m\/2+1), ..., r(m-1)]@] -- @= [s0-z1^0*t0, s1-z1^1*t1, ..., s(m\/2-1)-z1^(m\/2-1)*t(m\/2-1)]@ -- -- The parameters are as follows: -- -- [\(\bullet\) @2*n@ is the length of the input and output] -- arrays -- -- [\(\bullet\) \(w\) is such that \(2^w\) is an \(2n\)-th root of unity in the ring \(\mathbf{Z}/p\mathbf{Z}\) that we are working in, i.e. \(p = 2^{wn} + 1\) (here \(n\) is divisible by] -- @GMP_LIMB_BITS@) -- -- \(\bullet\) @ii@ is the array of inputs (each input is an array of limbs -- of length @wn\/GMP_LIMB_BITS + 1@ (the extra limbs being a \"carry -- limb\"). Outputs are written in-place. -- -- We require \(nw\) to be at least 64 and the two temporary space pointers -- to point to blocks of size @n*w + FLINT_BITS@ bits. foreign import ccall "fft.h ifft_radix2" ifft_radix2 :: Ptr (Ptr CMpLimb) -> CMpSize -> CFBitCnt -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> IO () -- | /ifft_truncate/ /ii/ /n/ /w/ /t1/ /t2/ /trunc/ -- -- As for @ifft_radix2@ except that the output is assumed to have zeros -- from coefficient trunc onwards and only the first trunc coefficients of -- the input are specified. The remaining coefficients need to exist as the -- extra space is needed, but their value is irrelevant. The value of -- @trunc@ must be divisible by 2. -- -- Although the implementation does not require it, we assume for -- simplicity that @trunc@ is greater than \(n\). The algorithm begins by -- computing the inverse transform of the first \(n\) coefficients of the -- input array. The unspecified coefficients of the second half of the -- array are then written: coefficient @trunc + i@ is computed as a twist -- of coefficient @i@ by a root of unity. The values of these coefficients -- are then equal to what they would have been if the inverse transform of -- the right hand side of the input array had been computed with full data -- from the start. The function @ifft_truncate1@ is then called on the -- entire right half of the input array with this auxiliary data filled in. -- Finally a single layer of the IFFT is completed on all the coefficients -- up to @trunc@ being careful to note that this involves doubling the -- coefficients from @trunc - n@ up to @n@. foreign import ccall "fft.h ifft_truncate" ifft_truncate :: Ptr (Ptr CMpLimb) -> CMpSize -> CFBitCnt -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> CMpSize -> IO () -- | /ifft_truncate1/ /ii/ /n/ /w/ /t1/ /t2/ /trunc/ -- -- Computes the first @trunc@ coefficients of the radix 2 inverse transform -- assuming the first @trunc@ coefficients are given and that the remaining -- coefficients have been set to the value they would have if an inverse -- transform had already been applied with full data. -- -- The algorithm is the same as for @ifft_truncate@ except that the -- coefficients from @trunc@ onwards after the inverse transform are not -- inferred to be zero but the supplied values. foreign import ccall "fft.h ifft_truncate1" ifft_truncate1 :: Ptr (Ptr CMpLimb) -> CMpSize -> CFBitCnt -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> CMpSize -> IO () -- | /fft_butterfly_sqrt2/ /s/ /t/ /i1/ /i2/ /i/ /limbs/ /w/ /temp/ -- -- Let \(w = 2k + 1\), \(i = 2j + 1\). Set @s = i1 + i2@, -- @t = z1^i*(i1 - i2)@ modulo @B^limbs + 1@ where @z1^2 = exp(Pi*I\/n)@ -- corresponds to multiplication by \(2^w\). Requires \(0 \leq i < 2n\) -- where \(nw =\) @limbs*FLINT_BITS@. -- -- Here @z1@ corresponds to multiplication by \(2^k\) then multiplication -- by @(2^(3nw\/4) - 2^(nw\/4))@. We see @z1^i@ corresponds to -- multiplication by @(2^(3nw\/4) - 2^(nw\/4))*2^(j+ik)@. -- -- We first multiply by @2^(j + ik + wn\/4)@ then multiply by an additional -- @2^(nw\/2)@ and subtract. foreign import ccall "fft.h fft_butterfly_sqrt2" fft_butterfly_sqrt2 :: Ptr CMpLimb -> Ptr CMpLimb -> Ptr CMpLimb -> Ptr CMpLimb -> CMpSize -> CMpSize -> CFBitCnt -> Ptr CMpLimb -> IO () -- | /ifft_butterfly_sqrt2/ /s/ /t/ /i1/ /i2/ /i/ /limbs/ /w/ /temp/ -- -- Let \(w = 2k + 1\), \(i = 2j + 1\). Set @s = i1 + z1^i*i2@, -- @t = i1 - z1^i*i2@ modulo @B^limbs + 1@ where @z1^2 = exp(-Pi*I\/n)@ -- corresponds to division by \(2^w\). Requires \(0 \leq i < 2n\) where -- \(nw =\) @limbs*FLINT_BITS@. -- -- Here @z1@ corresponds to division by \(2^k\) then division by -- @(2^(3nw\/4) - 2^(nw\/4))@. We see @z1^i@ corresponds to division by -- @(2^(3nw\/4) - 2^(nw\/4))*2^(j+ik)@ which is the same as division by -- @2^(j+ik + 1)@ then multiplication by @(2^(3nw\/4) - 2^(nw\/4))@. -- -- Of course, division by @2^(j+ik + 1)@ is the same as multiplication by -- @2^(2*wn - j - ik - 1)@. The exponent is positive as -- \(i \leq 2\cdot n\), \(j < n\), \(k < w/2\). -- -- We first multiply by @2^(2*wn - j - ik - 1 + wn\/4)@ then multiply by an -- additional @2^(nw\/2)@ and subtract. foreign import ccall "fft.h ifft_butterfly_sqrt2" ifft_butterfly_sqrt2 :: Ptr CMpLimb -> Ptr CMpLimb -> Ptr CMpLimb -> Ptr CMpLimb -> CMpSize -> CMpSize -> CFBitCnt -> Ptr CMpLimb -> IO () -- | /fft_truncate_sqrt2/ /ii/ /n/ /w/ /t1/ /t2/ /temp/ /trunc/ -- -- As per @fft_truncate@ except that the transform is twice the usual -- length, i.e. length \(4n\) rather than \(2n\). This is achieved by -- making use of twiddles by powers of a square root of 2, not powers of 2 -- in the first layer of the transform. -- -- We require \(nw\) to be at least 64 and the three temporary space -- pointers to point to blocks of size @n*w + FLINT_BITS@ bits. foreign import ccall "fft.h fft_truncate_sqrt2" fft_truncate_sqrt2 :: Ptr (Ptr CMpLimb) -> CMpSize -> CFBitCnt -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> CMpSize -> IO () -- | /ifft_truncate_sqrt2/ /ii/ /n/ /w/ /t1/ /t2/ /temp/ /trunc/ -- -- As per @ifft_truncate@ except that the transform is twice the usual -- length, i.e. length \(4n\) instead of \(2n\). This is achieved by making -- use of twiddles by powers of a square root of 2, not powers of 2 in the -- final layer of the transform. -- -- We require \(nw\) to be at least 64 and the three temporary space -- pointers to point to blocks of size @n*w + FLINT_BITS@ bits. foreign import ccall "fft.h ifft_truncate_sqrt2" ifft_truncate_sqrt2 :: Ptr (Ptr CMpLimb) -> CMpSize -> CFBitCnt -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> CMpSize -> IO () -- Matrix Fourier Transforms --------------------------------------------------- -- | /fft_butterfly_twiddle/ /u/ /v/ /s/ /t/ /limbs/ /b1/ /b2/ -- -- Set @u = 2^b1*(s + t)@, @v = 2^b2*(s - t)@ modulo @B^limbs + 1@. This is -- used to compute @u = 2^(ws*tw1)*(s + t)@, @v = 2^(w+ws*tw2)*(s - t)@ in -- the matrix Fourier algorithm, i.e. effectively computing an ordinary -- butterfly with additional twiddles by @z1^rc@ for row \(r\) and column -- \(c\) of the matrix of coefficients. Aliasing is not allowed. foreign import ccall "fft.h fft_butterfly_twiddle" fft_butterfly_twiddle :: Ptr CMpLimb -> Ptr CMpLimb -> Ptr CMpLimb -> Ptr CMpLimb -> CMpSize -> CFBitCnt -> CFBitCnt -> IO () -- | /ifft_butterfly_twiddle/ /u/ /v/ /s/ /t/ /limbs/ /b1/ /b2/ -- -- Set @u = s\/2^b1 + t\/2^b1)@, @v = s\/2^b1 - t\/2^b1@ modulo -- @B^limbs + 1@. This is used to compute -- @u = 2^(-ws*tw1)*s + 2^(-ws*tw2)*t)@, -- @v = 2^(-ws*tw1)*s + 2^(-ws*tw2)*t)@ in the matrix Fourier algorithm, -- i.e. effectively computing an ordinary butterfly with additional -- twiddles by @z1^(-rc)@ for row \(r\) and column \(c\) of the matrix of -- coefficients. Aliasing is not allowed. foreign import ccall "fft.h ifft_butterfly_twiddle" ifft_butterfly_twiddle :: Ptr CMpLimb -> Ptr CMpLimb -> Ptr CMpLimb -> Ptr CMpLimb -> CMpSize -> CFBitCnt -> CFBitCnt -> IO () -- | /fft_radix2_twiddle/ /ii/ /is/ /n/ /w/ /t1/ /t2/ /ws/ /r/ /c/ /rs/ -- -- As for @fft_radix2@ except that the coefficients are spaced by @is@ in -- the array @ii@ and an additional twist by @z^c*i@ is applied to each -- coefficient where \(i\) starts at \(r\) and increases by @rs@ as one -- moves from one coefficient to the next. Here @z@ corresponds to -- multiplication by @2^ws@. foreign import ccall "fft.h fft_radix2_twiddle" fft_radix2_twiddle :: Ptr (Ptr CMpLimb) -> CMpSize -> CMpSize -> CFBitCnt -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> CMpSize -> CMpSize -> CMpSize -> CMpSize -> IO () -- | /ifft_radix2_twiddle/ /ii/ /is/ /n/ /w/ /t1/ /t2/ /ws/ /r/ /c/ /rs/ -- -- As for @ifft_radix2@ except that the coefficients are spaced by @is@ in -- the array @ii@ and an additional twist by @z^(-c*i)@ is applied to each -- coefficient where \(i\) starts at \(r\) and increases by @rs@ as one -- moves from one coefficient to the next. Here @z@ corresponds to -- multiplication by @2^ws@. foreign import ccall "fft.h ifft_radix2_twiddle" ifft_radix2_twiddle :: Ptr (Ptr CMpLimb) -> CMpSize -> CMpSize -> CFBitCnt -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> CMpSize -> CMpSize -> CMpSize -> CMpSize -> IO () -- | /fft_truncate1_twiddle/ /ii/ /is/ /n/ /w/ /t1/ /t2/ /ws/ /r/ /c/ /rs/ /trunc/ -- -- As per @fft_radix2_twiddle@ except that the transform is truncated as -- per @fft_truncate1@. foreign import ccall "fft.h fft_truncate1_twiddle" fft_truncate1_twiddle :: Ptr (Ptr CMpLimb) -> CMpSize -> CMpSize -> CFBitCnt -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> CMpSize -> CMpSize -> CMpSize -> CMpSize -> CMpSize -> IO () -- | /ifft_truncate1_twiddle/ /ii/ /is/ /n/ /w/ /t1/ /t2/ /ws/ /r/ /c/ /rs/ /trunc/ -- -- As per @ifft_radix2_twiddle@ except that the transform is truncated as -- per @ifft_truncate1@. foreign import ccall "fft.h ifft_truncate1_twiddle" ifft_truncate1_twiddle :: Ptr (Ptr CMpLimb) -> CMpSize -> CMpSize -> CFBitCnt -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> CMpSize -> CMpSize -> CMpSize -> CMpSize -> CMpSize -> IO () -- | /fft_mfa_truncate_sqrt2/ /ii/ /n/ /w/ /t1/ /t2/ /temp/ /n1/ /trunc/ -- -- This is as per the @fft_truncate_sqrt2@ function except that the matrix -- Fourier algorithm is used for the left and right FFTs. The total -- transform length is \(4n\) where @n = 2^depth@ so that the left and -- right transforms are both length \(2n\). We require @trunc > 2*n@ and -- that @trunc@ is divisible by @2*n1@ (explained below). The coefficients -- are produced in an order different from @fft_truncate_sqrt2@. -- -- The matrix Fourier algorithm, which is applied to each transform of -- length \(2n\), works as follows. We set @n1@ to a power of 2 about the -- square root of \(n\). The data is then thought of as a set of @n2@ rows -- each with @n1@ columns (so that @n1*n2 = 2n@). -- -- The length \(2n\) transform is then computed using a whole pile of short -- transforms. These comprise @n1@ column transforms of length @n2@ -- followed by some twiddles by roots of unity (namely @z^rc@ where \(r\) -- is the row and \(c\) the column within the data) followed by @n2@ row -- transforms of length @n1@. Along the way the data needs to be rearranged -- due to the fact that the short transforms output the data in binary -- reversed order compared with what is needed. -- -- The matrix Fourier algorithm provides better cache locality by -- decomposing the long length \(2n\) transforms into many transforms of -- about the square root of the original length. -- -- For better cache locality the sqrt2 layer of the full length \(4n\) -- transform is folded in with the column FFTs performed as part of the -- first matrix Fourier algorithm on the left half of the data. -- -- The second half of the data requires a truncated version of the matrix -- Fourier algorithm. This is achieved by truncating to an exact multiple -- of the row length so that the row transforms are full length. Moreover, -- the column transforms will then be truncated transforms and their -- truncated length needs to be a multiple of 2. This explains the -- condition on @trunc@ given above. -- -- To improve performance, the extra twiddles by roots of unity are -- combined with the butterflies performed at the last layer of the column -- transforms. -- -- We require \(nw\) to be at least 64 and the three temporary space -- pointers to point to blocks of size @n*w + FLINT_BITS@ bits. foreign import ccall "fft.h fft_mfa_truncate_sqrt2" fft_mfa_truncate_sqrt2 :: Ptr (Ptr CMpLimb) -> CMpSize -> CFBitCnt -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> CMpSize -> CMpSize -> IO () -- | /ifft_mfa_truncate_sqrt2/ /ii/ /n/ /w/ /t1/ /t2/ /temp/ /n1/ /trunc/ -- -- This is as per the @ifft_truncate_sqrt2@ function except that the matrix -- Fourier algorithm is used for the left and right IFFTs. The total -- transform length is \(4n\) where @n = 2^depth@ so that the left and -- right transforms are both length \(2n\). We require @trunc > 2*n@ and -- that @trunc@ is divisible by @2*n1@. -- -- We set @n1@ to a power of 2 about the square root of \(n\). -- -- As per the matrix fourier FFT the sqrt2 layer is folded into the final -- column IFFTs for better cache locality and the extra twiddles that occur -- in the matrix Fourier algorithm are combined with the butterflied -- performed at the first layer of the final column transforms. -- -- We require \(nw\) to be at least 64 and the three temporary space -- pointers to point to blocks of size @n*w + FLINT_BITS@ bits. foreign import ccall "fft.h ifft_mfa_truncate_sqrt2" ifft_mfa_truncate_sqrt2 :: Ptr (Ptr CMpLimb) -> CMpSize -> CFBitCnt -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> CMpSize -> CMpSize -> IO () -- | /fft_mfa_truncate_sqrt2_outer/ /ii/ /n/ /w/ /t1/ /t2/ /temp/ /n1/ /trunc/ -- -- Just the outer layers of @fft_mfa_truncate_sqrt2@. foreign import ccall "fft.h fft_mfa_truncate_sqrt2_outer" fft_mfa_truncate_sqrt2_outer :: Ptr (Ptr CMpLimb) -> CMpSize -> CFBitCnt -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> CMpSize -> CMpSize -> IO () -- | /fft_mfa_truncate_sqrt2_inner/ /ii/ /jj/ /n/ /w/ /t1/ /t2/ /temp/ /n1/ /trunc/ /tt/ -- -- The inner layers of @fft_mfa_truncate_sqrt2@ and -- @ifft_mfa_truncate_sqrt2@ combined with pointwise mults. foreign import ccall "fft.h fft_mfa_truncate_sqrt2_inner" fft_mfa_truncate_sqrt2_inner :: Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> CMpSize -> CFBitCnt -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> CMpSize -> CMpSize -> Ptr CMpLimb -> IO () -- | /ifft_mfa_truncate_sqrt2_outer/ /ii/ /n/ /w/ /t1/ /t2/ /temp/ /n1/ /trunc/ -- -- The outer layers of @ifft_mfa_truncate_sqrt2@ combined with -- normalisation. foreign import ccall "fft.h ifft_mfa_truncate_sqrt2_outer" ifft_mfa_truncate_sqrt2_outer :: Ptr (Ptr CMpLimb) -> CMpSize -> CFBitCnt -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> CMpSize -> CMpSize -> IO () -- Negacyclic multiplication --------------------------------------------------- -- | /fft_negacyclic/ /ii/ /n/ /w/ /t1/ /t2/ /temp/ -- -- As per @fft_radix2@ except that it performs a sqrt2 negacyclic transform -- of length \(2n\). This is the same as the radix 2 transform except that -- the \(i\)-th coefficient of the input is first multiplied by -- \(\sqrt{2}^{iw}\). -- -- We require \(nw\) to be at least 64 and the two temporary space pointers -- to point to blocks of size @n*w + FLINT_BITS@ bits. foreign import ccall "fft.h fft_negacyclic" fft_negacyclic :: Ptr (Ptr CMpLimb) -> CMpSize -> CFBitCnt -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> IO () -- | /ifft_negacyclic/ /ii/ /n/ /w/ /t1/ /t2/ /temp/ -- -- As per @ifft_radix2@ except that it performs a sqrt2 negacyclic inverse -- transform of length \(2n\). This is the same as the radix 2 inverse -- transform except that the \(i\)-th coefficient of the output is finally -- divided by \(\sqrt{2}^{iw}\). -- -- We require \(nw\) to be at least 64 and the two temporary space pointers -- to point to blocks of size @n*w + FLINT_BITS@ bits. foreign import ccall "fft.h ifft_negacyclic" ifft_negacyclic :: Ptr (Ptr CMpLimb) -> CMpSize -> CFBitCnt -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> IO () -- | /fft_naive_convolution_1/ /r/ /ii/ /jj/ /m/ -- -- Performs a naive negacyclic convolution of @ii@ with @jj@, both of -- length \(m\), and sets \(r\) to the result. This is essentially -- multiplication of polynomials modulo \(x^m + 1\). foreign import ccall "fft.h fft_naive_convolution_1" fft_naive_convolution_1 :: Ptr CMpLimb -> Ptr CMpLimb -> Ptr CMpLimb -> CMpSize -> IO () -- | /_fft_mulmod_2expp1/ /r1/ /i1/ /i2/ /r_limbs/ /depth/ /w/ -- -- Multiply @i1@ by @i2@ modulo @B^r_limbs + 1@ where -- @r_limbs = nw\/FLINT_BITS@ with @n = 2^depth@. Uses the negacyclic FFT -- convolution CRT\'d with a 1 limb naive convolution. We require that -- @depth@ and @w@ have been selected as per the wrapper -- @fft_mulmod_2expp1@ below. foreign import ccall "fft.h _fft_mulmod_2expp1" _fft_mulmod_2expp1 :: Ptr CMpLimb -> Ptr CMpLimb -> Ptr CMpLimb -> CMpSize -> CFBitCnt -> CFBitCnt -> IO () -- | /fft_adjust_limbs/ /limbs/ -- -- Given a number of limbs, returns a new number of limbs (no more than the -- next power of 2) which will work with the Nussbaumer code. It is only -- necessary to make this adjustment if @limbs > FFT_MULMOD_2EXPP1_CUTOFF@. foreign import ccall "fft.h fft_adjust_limbs" fft_adjust_limbs :: CMpSize -> IO CLong -- | /fft_mulmod_2expp1/ /r/ /i1/ /i2/ /n/ /w/ /tt/ -- -- As per @_fft_mulmod_2expp1@ but with a tuned cutoff below which more -- classical methods are used for the convolution. The temporary space is -- required to fit @n*w + FLINT_BITS@ bits. There are no restrictions on -- \(n\), but if @limbs = n*w\/FLINT_BITS@ then if @limbs@ exceeds -- @FFT_MULMOD_2EXPP1_CUTOFF@ the function @fft_adjust_limbs@ must be -- called to increase the number of limbs to an appropriate value. foreign import ccall "fft.h fft_mulmod_2expp1" fft_mulmod_2expp1 :: Ptr CMpLimb -> Ptr CMpLimb -> Ptr CMpLimb -> CMpSize -> CMpSize -> Ptr CMpLimb -> IO () -- Integer multiplication ------------------------------------------------------ -- | /mul_truncate_sqrt2/ /r1/ /i1/ /n1/ /i2/ /n2/ /depth/ /w/ -- -- Integer multiplication using the radix 2 truncated sqrt2 transforms. -- -- Set @(r1, n1 + n2)@ to the product of @(i1, n1)@ by @(i2, n2)@. This is -- achieved through an FFT convolution of length at most @2^(depth + 2)@ -- with coefficients of size \(nw\) bits where @n = 2^depth@. We require -- @depth >= 6@. The input data is broken into chunks of data not exceeding -- @(nw - (depth + 1))\/2@ bits. If breaking the first integer into chunks -- of this size results in @j1@ coefficients and breaking the second -- integer results in @j2@ chunks then @j1 + j2 - 1 \<= 2^(depth + 2)@. -- -- If @n = 2^depth@ then we require \(nw\) to be at least 64. foreign import ccall "fft.h mul_truncate_sqrt2" mul_truncate_sqrt2 :: Ptr CMp -> Ptr CMp -> CMpSize -> Ptr CMp -> CMpSize -> CFBitCnt -> CFBitCnt -> IO () -- | /mul_mfa_truncate_sqrt2/ /r1/ /i1/ /n1/ /i2/ /n2/ /depth/ /w/ -- -- As for @mul_truncate_sqrt2@ except that the cache friendly matrix -- Fourier algorithm is used. -- -- If @n = 2^depth@ then we require \(nw\) to be at least 64. Here we also -- require \(w\) to be \(2^i\) for some \(i \geq 0\). foreign import ccall "fft.h mul_mfa_truncate_sqrt2" mul_mfa_truncate_sqrt2 :: Ptr CMp -> Ptr CMp -> CMpSize -> Ptr CMp -> CMpSize -> CFBitCnt -> CFBitCnt -> IO () -- | /flint_mpn_mul_fft_main/ /r1/ /i1/ /n1/ /i2/ /n2/ -- -- The main integer multiplication routine. Sets @(r1, n1 + n2)@ to -- @(i1, n1)@ times @(i2, n2)@. We require @n1 >= n2 > 0@. foreign import ccall "fft.h flint_mpn_mul_fft_main" flint_mpn_mul_fft_main :: Ptr CMp -> Ptr CMp -> CMpSize -> Ptr CMp -> CMpSize -> IO () -- Convolution ----------------------------------------------------------------- -- | /fft_convolution/ /ii/ /jj/ /depth/ /limbs/ /trunc/ /t1/ /t2/ /s1/ /tt/ -- -- Perform an FFT convolution of @ii@ with @jj@, both of length @4*n@ where -- @n = 2^depth@. Assume that all but the first @trunc@ coefficients of the -- output (placed in @ii@) are zero. Each coefficient is taken modulo -- @B^limbs + 1@. The temporary spaces @t1@, @t2@ and @s1@ must have -- @limbs + 1@ limbs of space and @tt@ must have @2*(limbs + 1)@ of free -- space. foreign import ccall "fft.h fft_convolution" fft_convolution :: Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> CLong -> CLong -> CLong -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> Ptr CMpLimb -> IO () -- FFT Precaching -------------------------------------------------------------- -- | /fft_precache/ /jj/ /depth/ /limbs/ /trunc/ /t1/ /t2/ /s1/ -- -- Precompute the FFT of @jj@ for use with precache functions. The -- parameters are as for @fft_convolution@. foreign import ccall "fft.h fft_precache" fft_precache :: Ptr (Ptr CMpLimb) -> CLong -> CLong -> CLong -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> IO () -- | /fft_convolution_precache/ /ii/ /jj/ /depth/ /limbs/ /trunc/ /t1/ /t2/ /s1/ /tt/ -- -- As per @fft_convolution@ except that it is assumed @fft_precache@ has -- been called on @jj@ with the same parameters. This will then run faster -- than if @fft_convolution@ had been run with the original @jj@. foreign import ccall "fft.h fft_convolution_precache" fft_convolution_precache :: Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> CLong -> CLong -> CLong -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> Ptr (Ptr CMpLimb) -> IO ()