diff --git a/p256/src/arithmetic.rs b/p256/src/arithmetic.rs index 7cdf8b1d..9cdd94f1 100644 --- a/p256/src/arithmetic.rs +++ b/p256/src/arithmetic.rs @@ -8,7 +8,6 @@ pub(crate) mod field; #[cfg(feature = "hash2curve")] mod hash2curve; pub(crate) mod scalar; -pub(crate) mod util; use self::{field::FieldElement, scalar::Scalar}; use crate::NistP256; diff --git a/p256/src/arithmetic/field.rs b/p256/src/arithmetic/field.rs index 97d2cc2b..ddca0897 100644 --- a/p256/src/arithmetic/field.rs +++ b/p256/src/arithmetic/field.rs @@ -2,17 +2,18 @@ #![allow(clippy::assign_op_pattern, clippy::op_ref)] -use crate::{ - arithmetic::util::{adc, mac, sbb, u256_to_u64x4, u64x4_to_u256}, - FieldBytes, -}; +#[cfg_attr(target_pointer_width = "32", path = "field/field32.rs")] +#[cfg_attr(target_pointer_width = "64", path = "field/field64.rs")] +mod field_impl; + +use crate::FieldBytes; use core::{ iter::{Product, Sum}, ops::{Add, AddAssign, Mul, MulAssign, Neg, Sub, SubAssign}, }; use elliptic_curve::ops::Invert; use elliptic_curve::{ - bigint::{ArrayEncoding, U256}, + bigint::{ArrayEncoding, U256, U512}, ff::{Field, PrimeField}, rand_core::RngCore, subtle::{Choice, ConditionallySelectable, ConstantTimeEq, ConstantTimeLess, CtOption}, @@ -116,33 +117,17 @@ impl FieldElement { /// Returns self + rhs mod p pub const fn add(&self, rhs: &Self) -> Self { - let a = u256_to_u64x4(self.0); - let b = u256_to_u64x4(rhs.0); - - // Bit 256 of p is set, so addition can result in five words. - let (w0, carry) = adc(a[0], b[0], 0); - let (w1, carry) = adc(a[1], b[1], carry); - let (w2, carry) = adc(a[2], b[2], carry); - let (w3, w4) = adc(a[3], b[3], carry); - - // Attempt to subtract the modulus, to ensure the result is in the field. - let modulus = u256_to_u64x4(MODULUS.0); - let (result, _) = Self::sub_inner( - w0, w1, w2, w3, w4, modulus[0], modulus[1], modulus[2], modulus[3], 0, - ); - result + Self(field_impl::add(self.0, rhs.0)) } - /// Returns 2*self. + /// Returns 2 * self. 
pub const fn double(&self) -> Self { self.add(self) } /// Returns self - rhs mod p pub const fn sub(&self, rhs: &Self) -> Self { - let a = u256_to_u64x4(self.0); - let b = u256_to_u64x4(rhs.0); - Self::sub_inner(a[0], a[1], a[2], a[3], 0, b[0], b[1], b[2], b[3], 0).0 + Self(field_impl::sub(self.0, rhs.0)) } /// Negate element. @@ -150,137 +135,10 @@ impl FieldElement { Self::sub(&Self::ZERO, self) } - fn from_bytes_wide(bytes: [u8; 64]) -> Self { - #[allow(clippy::unwrap_used)] - FieldElement::montgomery_reduce( - u64::from_be_bytes(bytes[0..8].try_into().unwrap()), - u64::from_be_bytes(bytes[8..16].try_into().unwrap()), - u64::from_be_bytes(bytes[16..24].try_into().unwrap()), - u64::from_be_bytes(bytes[24..32].try_into().unwrap()), - u64::from_be_bytes(bytes[32..40].try_into().unwrap()), - u64::from_be_bytes(bytes[40..48].try_into().unwrap()), - u64::from_be_bytes(bytes[48..56].try_into().unwrap()), - u64::from_be_bytes(bytes[56..64].try_into().unwrap()), - ) - } - - #[inline] - #[allow(clippy::too_many_arguments)] - const fn sub_inner( - l0: u64, - l1: u64, - l2: u64, - l3: u64, - l4: u64, - r0: u64, - r1: u64, - r2: u64, - r3: u64, - r4: u64, - ) -> (Self, u64) { - let (w0, borrow) = sbb(l0, r0, 0); - let (w1, borrow) = sbb(l1, r1, borrow); - let (w2, borrow) = sbb(l2, r2, borrow); - let (w3, borrow) = sbb(l3, r3, borrow); - let (_, borrow) = sbb(l4, r4, borrow); - - // If underflow occurred on the final limb, borrow = 0xfff...fff, otherwise - // borrow = 0x000...000. Thus, we use it as a mask to conditionally add the - // modulus. 
- let modulus = u256_to_u64x4(MODULUS.0); - let (w0, carry) = adc(w0, modulus[0] & borrow, 0); - let (w1, carry) = adc(w1, modulus[1] & borrow, carry); - let (w2, carry) = adc(w2, modulus[2] & borrow, carry); - let (w3, _) = adc(w3, modulus[3] & borrow, carry); - - (Self(u64x4_to_u256([w0, w1, w2, w3])), borrow) - } - - /// Montgomery Reduction - /// - /// The general algorithm is: - /// ```text - /// A <- input (2n b-limbs) - /// for i in 0..n { - /// k <- A[i] p' mod b - /// A <- A + k p b^i - /// } - /// A <- A / b^n - /// if A >= p { - /// A <- A - p - /// } - /// ``` - /// - /// For secp256r1, we have the following simplifications: - /// - /// - `p'` is 1, so our multiplicand is simply the first limb of the intermediate A. - /// - /// - The first limb of p is 2^64 - 1; multiplications by this limb can be simplified - /// to a shift and subtraction: - /// ```text - /// a_i * (2^64 - 1) = a_i * 2^64 - a_i = (a_i << 64) - a_i - /// ``` - /// However, because `p' = 1`, the first limb of p is multiplied by limb i of the - /// intermediate A and then immediately added to that same limb, so we simply - /// initialize the carry to limb i of the intermediate. - /// - /// - The third limb of p is zero, so we can ignore any multiplications by it and just - /// add the carry. 
- /// - /// References: - /// - Handbook of Applied Cryptography, Chapter 14 - /// Algorithm 14.32 - /// http://cacr.uwaterloo.ca/hac/about/chap14.pdf - /// - /// - Efficient and Secure Elliptic Curve Cryptography Implementation of Curve P-256 - /// Algorithm 7) Montgomery Word-by-Word Reduction - /// https://csrc.nist.gov/csrc/media/events/workshop-on-elliptic-curve-cryptography-standards/documents/papers/session6-adalier-mehmet.pdf - #[inline] - #[allow(clippy::too_many_arguments)] - const fn montgomery_reduce( - r0: u64, - r1: u64, - r2: u64, - r3: u64, - r4: u64, - r5: u64, - r6: u64, - r7: u64, - ) -> Self { - let modulus = u256_to_u64x4(MODULUS.0); - - let (r1, carry) = mac(r1, r0, modulus[1], r0); - let (r2, carry) = adc(r2, 0, carry); - let (r3, carry) = mac(r3, r0, modulus[3], carry); - let (r4, carry2) = adc(r4, 0, carry); - - let (r2, carry) = mac(r2, r1, modulus[1], r1); - let (r3, carry) = adc(r3, 0, carry); - let (r4, carry) = mac(r4, r1, modulus[3], carry); - let (r5, carry2) = adc(r5, carry2, carry); - - let (r3, carry) = mac(r3, r2, modulus[1], r2); - let (r4, carry) = adc(r4, 0, carry); - let (r5, carry) = mac(r5, r2, modulus[3], carry); - let (r6, carry2) = adc(r6, carry2, carry); - - let (r4, carry) = mac(r4, r3, modulus[1], r3); - let (r5, carry) = adc(r5, 0, carry); - let (r6, carry) = mac(r6, r3, modulus[3], carry); - let (r7, r8) = adc(r7, carry2, carry); - - // Result may be within MODULUS of the correct value - let (result, _) = Self::sub_inner( - r4, r5, r6, r7, r8, modulus[0], modulus[1], modulus[2], modulus[3], 0, - ); - result - } - /// Translate a field element out of the Montgomery domain. #[inline] pub(crate) const fn to_canonical(self) -> Self { - let w = u256_to_u64x4(self.0); - FieldElement::montgomery_reduce(w[0], w[1], w[2], w[3], 0, 0, 0, 0) + Self(field_impl::to_canonical(self.0)) } /// Translate a field element into the Montgomery domain. 
@@ -291,31 +149,8 @@ impl FieldElement { /// Returns self * rhs mod p pub const fn multiply(&self, rhs: &Self) -> Self { - // Schoolbook multiplication. - let a = u256_to_u64x4(self.0); - let b = u256_to_u64x4(rhs.0); - - let (w0, carry) = mac(0, a[0], b[0], 0); - let (w1, carry) = mac(0, a[0], b[1], carry); - let (w2, carry) = mac(0, a[0], b[2], carry); - let (w3, w4) = mac(0, a[0], b[3], carry); - - let (w1, carry) = mac(w1, a[1], b[0], 0); - let (w2, carry) = mac(w2, a[1], b[1], carry); - let (w3, carry) = mac(w3, a[1], b[2], carry); - let (w4, w5) = mac(w4, a[1], b[3], carry); - - let (w2, carry) = mac(w2, a[2], b[0], 0); - let (w3, carry) = mac(w3, a[2], b[1], carry); - let (w4, carry) = mac(w4, a[2], b[2], carry); - let (w5, w6) = mac(w5, a[2], b[3], carry); - - let (w3, carry) = mac(w3, a[3], b[0], 0); - let (w4, carry) = mac(w4, a[3], b[1], carry); - let (w5, carry) = mac(w5, a[3], b[2], carry); - let (w6, w7) = mac(w6, a[3], b[3], carry); - - FieldElement::montgomery_reduce(w0, w1, w2, w3, w4, w5, w6, w7) + let (lo, hi): (U256, U256) = self.0.split_mul(&rhs.0); + Self(field_impl::montgomery_reduce(lo, hi)) } /// Returns self * self mod p @@ -420,7 +255,8 @@ impl Field for FieldElement { // negligible bias from the uniform distribution. let mut buf = [0; 64]; rng.fill_bytes(&mut buf); - FieldElement::from_bytes_wide(buf) + let buf = U512::from_be_slice(&buf); + Self(field_impl::from_bytes_wide(buf)) } #[must_use] @@ -666,9 +502,13 @@ impl<'a> Product<&'a FieldElement> for FieldElement { #[cfg(test)] mod tests { - use super::{u64x4_to_u256, FieldElement}; + use super::FieldElement; use crate::{test_vectors::field::DBL_TEST_VECTORS, FieldBytes}; use core::ops::Mul; + + #[cfg(target_pointer_width = "64")] + use crate::U256; + #[cfg(target_pointer_width = "64")] use proptest::{num::u64::ANY, prelude::*}; #[test] @@ -783,6 +623,7 @@ mod tests { assert_eq!(four.sqrt().unwrap(), two); } + #[cfg(target_pointer_width = "64")] proptest! 
{ /// This checks behaviour well within the field ranges, because it doesn't set the /// highest limb. @@ -795,8 +636,8 @@ mod tests { b1 in ANY, b2 in ANY, ) { - let a = FieldElement(u64x4_to_u256([a0, a1, a2, 0])); - let b = FieldElement(u64x4_to_u256([b0, b1, b2, 0])); + let a = FieldElement(U256::from_words([a0, a1, a2, 0])); + let b = FieldElement(U256::from_words([b0, b1, b2, 0])); assert_eq!(a.add(&b).sub(&a), b); } } diff --git a/p256/src/arithmetic/field/field32.rs b/p256/src/arithmetic/field/field32.rs new file mode 100644 index 00000000..0ec6c78a --- /dev/null +++ b/p256/src/arithmetic/field/field32.rs @@ -0,0 +1,264 @@ +//! 32-bit secp256r1 field element algorithms. + +use super::MODULUS; +use elliptic_curve::bigint::{Limb, U256, U512}; + +pub(super) const fn add(a: U256, b: U256) -> U256 { + let a = a.as_limbs(); + let b = b.as_limbs(); + + // Bit 256 of p is set, so addition can result in nine words. + // let (w0, carry) = adc(a[0], b[0], 0); + let (w0, carry) = a[0].adc(b[0], Limb::ZERO); + let (w1, carry) = a[1].adc(b[1], carry); + let (w2, carry) = a[2].adc(b[2], carry); + let (w3, carry) = a[3].adc(b[3], carry); + let (w4, carry) = a[4].adc(b[4], carry); + let (w5, carry) = a[5].adc(b[5], carry); + let (w6, carry) = a[6].adc(b[6], carry); + let (w7, w8) = a[7].adc(b[7], carry); + // Attempt to subtract the modulus, to ensure the result is in the field. 
+ let modulus = MODULUS.0.as_limbs(); + + let (result, _) = sub_inner( + [w0, w1, w2, w3, w4, w5, w6, w7, w8], + [ + modulus[0], + modulus[1], + modulus[2], + modulus[3], + modulus[4], + modulus[5], + modulus[6], + modulus[7], + Limb::ZERO, + ], + ); + U256::new([ + result[0], result[1], result[2], result[3], result[4], result[5], result[6], result[7], + ]) +} + +pub(super) const fn sub(a: U256, b: U256) -> U256 { + let a = a.as_limbs(); + let b = b.as_limbs(); + + let (result, _) = sub_inner( + [a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], Limb::ZERO], + [b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7], Limb::ZERO], + ); + U256::new([ + result[0], result[1], result[2], result[3], result[4], result[5], result[6], result[7], + ]) +} + +#[inline] +pub(super) const fn to_canonical(a: U256) -> U256 { + montgomery_reduce(a, U256::ZERO) +} + +pub(super) fn from_bytes_wide(a: U512) -> U256 { + let words = a.to_limbs(); + montgomery_reduce( + U256::new([ + words[8], words[9], words[10], words[11], words[12], words[13], words[14], words[15], + ]), + U256::new([ + words[0], words[1], words[2], words[3], words[4], words[5], words[6], words[7], + ]), + ) +} + +/// Montgomery Reduction +/// +/// The general algorithm is: +/// ```text +/// A <- input (2n b-limbs) +/// for i in 0..n { +/// k <- A[i] p' mod b +/// A <- A + k p b^i +/// } +/// A <- A / b^n +/// if A >= p { +/// A <- A - p +/// } +/// ``` +/// +/// For secp256r1, with a 32-bit arithmetic, we have the following +/// simplifications: +/// +/// - `p'` is 1, so our multiplicand is simply the first limb of the intermediate A. 
+/// +/// - The first limb of p is 2^32 - 1; multiplications by this limb can be simplified +/// to a shift and subtraction: +/// ```text +/// a_i * (2^32 - 1) = a_i * 2^32 - a_i = (a_i << 32) - a_i +/// ``` +/// However, because `p' = 1`, the first limb of p is multiplied by limb i of the +/// intermediate A and then immediately added to that same limb, so we simply +/// initialize the carry to limb i of the intermediate. +/// +/// The same applies for the second and third limbs. +/// +/// - The fourth limb of p is zero, so we can ignore any multiplications by it and just +/// add the carry. +/// +/// The same applies for the fifth and sixth limbs. +/// +/// - The seventh limb of p is one, so we can substitute a `mac` operation with an `adc` one. +/// +/// References: +/// - Handbook of Applied Cryptography, Chapter 14 +/// Algorithm 14.32 +/// http://cacr.uwaterloo.ca/hac/about/chap14.pdf +/// +/// - Efficient and Secure Elliptic Curve Cryptography Implementation of Curve P-256 +/// Algorithm 7) Montgomery Word-by-Word Reduction +/// https://csrc.nist.gov/csrc/media/events/workshop-on-elliptic-curve-cryptography-standards/documents/papers/session6-adalier-mehmet.pdf +#[inline] +#[allow(clippy::too_many_arguments)] +pub(super) const fn montgomery_reduce(lo: U256, hi: U256) -> U256 { + let lo = lo.as_limbs(); + let hi = hi.as_limbs(); + + let a0 = lo[0]; + let a1 = lo[1]; + let a2 = lo[2]; + let a3 = lo[3]; + let a4 = lo[4]; + let a5 = lo[5]; + let a6 = lo[6]; + let a7 = lo[7]; + let a8 = hi[0]; + let a9 = hi[1]; + let a10 = hi[2]; + let a11 = hi[3]; + let a12 = hi[4]; + let a13 = hi[5]; + let a14 = hi[6]; + let a15 = hi[7]; + + let modulus = MODULUS.0.as_limbs(); + + /* + * let (a0, c) = (0, a0); + * let (a1, c) = (a1, a0); + * let (a2, c) = (a2, a0); + */ + let (a3, carry) = a3.adc(Limb::ZERO, a0); + let (a4, carry) = a4.adc(Limb::ZERO, carry); + let (a5, carry) = a5.adc(Limb::ZERO, carry); + let (a6, carry) = a6.adc(a0, carry); + // NOTE `modulus[7]` is 2^32 - 1, 
this could be optimized to `adc` and `sbb` + // but multiplication costs 1 clock-cycle on several architectures, + // thanks to parallelization + let (a7, carry) = a7.mac(a0, modulus[7], carry); + /* optimization with only adc and sbb + * let (x, _) = sbb(0, a0, 0); + * let (y, _) = sbb(a0, 0, (a0 != 0) as u32); + * + * (a7, carry) = adc(a7, x, carry); + * (carry, _) = adc(y, 0, carry); + */ + let (a8, carry2) = a8.adc(Limb::ZERO, carry); + + let (a4, carry) = a4.adc(Limb::ZERO, a1); + let (a5, carry) = a5.adc(Limb::ZERO, carry); + let (a6, carry) = a6.adc(Limb::ZERO, carry); + let (a7, carry) = a7.adc(a1, carry); + let (a8, carry) = a8.mac(a1, modulus[7], carry); + let (a9, carry2) = a9.adc(carry2, carry); + + let (a5, carry) = a5.adc(Limb::ZERO, a2); + let (a6, carry) = a6.adc(Limb::ZERO, carry); + let (a7, carry) = a7.adc(Limb::ZERO, carry); + let (a8, carry) = a8.adc(a2, carry); + let (a9, carry) = a9.mac(a2, modulus[7], carry); + let (a10, carry2) = a10.adc(carry2, carry); + + let (a6, carry) = a6.adc(Limb::ZERO, a3); + let (a7, carry) = a7.adc(Limb::ZERO, carry); + let (a8, carry) = a8.adc(Limb::ZERO, carry); + let (a9, carry) = a9.adc(a3, carry); + let (a10, carry) = a10.mac(a3, modulus[7], carry); + let (a11, carry2) = a11.adc(carry2, carry); + + let (a7, carry) = a7.adc(Limb::ZERO, a4); + let (a8, carry) = a8.adc(Limb::ZERO, carry); + let (a9, carry) = a9.adc(Limb::ZERO, carry); + let (a10, carry) = a10.adc(a4, carry); + let (a11, carry) = a11.mac(a4, modulus[7], carry); + let (a12, carry2) = a12.adc(carry2, carry); + + let (a8, carry) = a8.adc(Limb::ZERO, a5); + let (a9, carry) = a9.adc(Limb::ZERO, carry); + let (a10, carry) = a10.adc(Limb::ZERO, carry); + let (a11, carry) = a11.adc(a5, carry); + let (a12, carry) = a12.mac(a5, modulus[7], carry); + let (a13, carry2) = a13.adc(carry2, carry); + + let (a9, carry) = a9.adc(Limb::ZERO, a6); + let (a10, carry) = a10.adc(Limb::ZERO, carry); + let (a11, carry) = a11.adc(Limb::ZERO, carry); + let (a12, carry) = 
a12.adc(a6, carry); + let (a13, carry) = a13.mac(a6, modulus[7], carry); + let (a14, carry2) = a14.adc(carry2, carry); + + let (a10, carry) = a10.adc(Limb::ZERO, a7); + let (a11, carry) = a11.adc(Limb::ZERO, carry); + let (a12, carry) = a12.adc(Limb::ZERO, carry); + let (a13, carry) = a13.adc(a7, carry); + let (a14, carry) = a14.mac(a7, modulus[7], carry); + let (a15, a16) = a15.adc(carry2, carry); + + // Result may be within MODULUS of the correct value + let (result, _) = sub_inner( + [a8, a9, a10, a11, a12, a13, a14, a15, a16], + [ + modulus[0], + modulus[1], + modulus[2], + modulus[3], + modulus[4], + modulus[5], + modulus[6], + modulus[7], + Limb::ZERO, + ], + ); + + U256::new([ + result[0], result[1], result[2], result[3], result[4], result[5], result[6], result[7], + ]) +} + +#[inline] +#[allow(clippy::too_many_arguments)] +const fn sub_inner(l: [Limb; 9], r: [Limb; 9]) -> ([Limb; 8], Limb) { + let (w0, borrow) = l[0].sbb(r[0], Limb::ZERO); + let (w1, borrow) = l[1].sbb(r[1], borrow); + let (w2, borrow) = l[2].sbb(r[2], borrow); + let (w3, borrow) = l[3].sbb(r[3], borrow); + let (w4, borrow) = l[4].sbb(r[4], borrow); + let (w5, borrow) = l[5].sbb(r[5], borrow); + let (w6, borrow) = l[6].sbb(r[6], borrow); + let (w7, borrow) = l[7].sbb(r[7], borrow); + let (_, borrow) = l[8].sbb(r[8], borrow); + + // If underflow occurred on the final limb, borrow = 0xfff...fff, otherwise + // borrow = 0x000...000. Thus, we use it as a mask to conditionally add + // the modulus. 
+ + let modulus = MODULUS.0.as_limbs(); + + let (w0, carry) = w0.adc(modulus[0].bitand(borrow), Limb::ZERO); + let (w1, carry) = w1.adc(modulus[1].bitand(borrow), carry); + let (w2, carry) = w2.adc(modulus[2].bitand(borrow), carry); + let (w3, carry) = w3.adc(modulus[3].bitand(borrow), carry); + let (w4, carry) = w4.adc(modulus[4].bitand(borrow), carry); + let (w5, carry) = w5.adc(modulus[5].bitand(borrow), carry); + let (w6, carry) = w6.adc(modulus[6].bitand(borrow), carry); + let (w7, _) = w7.adc(modulus[7].bitand(borrow), carry); + + ([w0, w1, w2, w3, w4, w5, w6, w7], borrow) +} diff --git a/p256/src/arithmetic/field/field64.rs b/p256/src/arithmetic/field/field64.rs new file mode 100644 index 00000000..8e235d90 --- /dev/null +++ b/p256/src/arithmetic/field/field64.rs @@ -0,0 +1,197 @@ +//! 64-bit secp256r1 field element algorithms. + +use super::MODULUS; +use elliptic_curve::bigint::{Limb, U256, U512}; + +pub(super) const fn add(a: U256, b: U256) -> U256 { + let a = a.as_limbs(); + let b = b.as_limbs(); + + // Bit 256 of p is set, so addition can result in five words. 
+ let (w0, carry) = a[0].adc(b[0], Limb::ZERO); + let (w1, carry) = a[1].adc(b[1], carry); + let (w2, carry) = a[2].adc(b[2], carry); + let (w3, w4) = a[3].adc(b[3], carry); + // let (w0, carry) = adc(a[0], b[0], 0); + // let (w1, carry) = adc(a[1], b[1], carry); + // let (w2, carry) = adc(a[2], b[2], carry); + // let (w3, w4) = adc(a[3], b[3], carry); + + // Attempt to subtract the modulus, to ensure the result is in the field + let modulus = MODULUS.0.as_limbs(); + + let (result, _) = sub_inner( + [w0, w1, w2, w3, w4], + [modulus[0], modulus[1], modulus[2], modulus[3], Limb::ZERO], + ); + U256::new([result[0], result[1], result[2], result[3]]) +} + +pub(super) const fn sub(a: U256, b: U256) -> U256 { + let a = a.as_limbs(); + let b = b.as_limbs(); + + let (result, _) = sub_inner( + [a[0], a[1], a[2], a[3], Limb::ZERO], + [b[0], b[1], b[2], b[3], Limb::ZERO], + ); + U256::new([result[0], result[1], result[2], result[3]]) +} + +#[inline] +pub(super) const fn to_canonical(a: U256) -> U256 { + montgomery_reduce(a, U256::ZERO) +} + +pub(super) fn from_bytes_wide(a: U512) -> U256 { + let words = a.to_limbs(); + montgomery_reduce( + U256::new([words[4], words[5], words[6], words[7]]), + U256::new([words[0], words[1], words[2], words[3]]), + ) +} + +/// Montgomery Reduction +/// +/// The general algorithm is: +/// ```text +/// A <- input (2n b-limbs) +/// for i in 0..n { +/// k <- A[i] p' mod b +/// A <- A + k p b^i +/// } +/// A <- A / b^n +/// if A >= p { +/// A <- A - p +/// } +/// ``` +/// +/// For secp256r1, with a 64-bit arithmetic, we have the following +/// simplifications: +/// +/// - `p'` is 1, so our multiplicand is simply the first limb of the intermediate A. 
+/// +/// - The first limb of p is 2^64 - 1; multiplications by this limb can be simplified +/// to a shift and subtraction: +/// ```text +/// a_i * (2^64 - 1) = a_i * 2^64 - a_i = (a_i << 64) - a_i +/// ``` +/// However, because `p' = 1`, the first limb of p is multiplied by limb i of the +/// intermediate A and then immediately added to that same limb, so we simply +/// initialize the carry to limb i of the intermediate. +/// +/// - The third limb of p is zero, so we can ignore any multiplications by it and just +/// add the carry. +/// +/// References: +/// - Handbook of Applied Cryptography, Chapter 14 +/// Algorithm 14.32 +/// http://cacr.uwaterloo.ca/hac/about/chap14.pdf +/// +/// - Efficient and Secure Elliptic Curve Cryptography Implementation of Curve P-256 +/// Algorithm 7) Montgomery Word-by-Word Reduction +/// https://csrc.nist.gov/csrc/media/events/workshop-on-elliptic-curve-cryptography-standards/documents/papers/session6-adalier-mehmet.pdf +#[inline] +#[allow(clippy::too_many_arguments)] +pub(super) const fn montgomery_reduce(lo: U256, hi: U256) -> U256 { + let lo = lo.as_limbs(); + let hi = hi.as_limbs(); + + let a0 = lo[0]; + let a1 = lo[1]; + let a2 = lo[2]; + let a3 = lo[3]; + let a4 = hi[0]; + let a5 = hi[1]; + let a6 = hi[2]; + let a7 = hi[3]; + + let modulus = MODULUS.0.as_limbs(); + + /* + let (a1, carry) = mac(a1, a0, modulus[1], a0); + let (a2, carry) = adc(a2, 0, carry); + let (a3, carry) = mac(a3, a0, modulus[3], carry); + let (a4, carry2) = adc(a4, 0, carry); + + let (a2, carry) = mac(a2, a1, modulus[1], a1); + let (a3, carry) = adc(a3, 0, carry); + let (a4, carry) = mac(a4, a1, modulus[3], carry); + let (a5, carry2) = adc(a5, carry2, carry); + + let (a3, carry) = mac(a3, a2, modulus[1], a2); + let (a4, carry) = adc(a4, 0, carry); + let (a5, carry) = mac(a5, a2, modulus[3], carry); + let (a6, carry2) = adc(a6, carry2, carry); + + let (a4, carry) = mac(a4, a3, modulus[1], a3); + let (a5, carry) = adc(a5, 0, carry); + let (a6, carry) = 
mac(a6, a3, modulus[3], carry); + let (a7, a8) = adc(a7, carry2, carry); + */ + + let (a1, carry) = a1.mac(a0, modulus[1], a0); + let (a2, carry) = a2.adc(Limb::ZERO, carry); + let (a3, carry) = a3.mac(a0, modulus[3], carry); + let (a4, carry2) = a4.adc(Limb::ZERO, carry); + + let (a2, carry) = a2.mac(a1, modulus[1], a1); + let (a3, carry) = a3.adc(Limb::ZERO, carry); + let (a4, carry) = a4.mac(a1, modulus[3], carry); + let (a5, carry2) = a5.adc(carry2, carry); + + let (a3, carry) = a3.mac(a2, modulus[1], a2); + let (a4, carry) = a4.adc(Limb::ZERO, carry); + let (a5, carry) = a5.mac(a2, modulus[3], carry); + let (a6, carry2) = a6.adc(carry2, carry); + + let (a4, carry) = a4.mac(a3, modulus[1], a3); + let (a5, carry) = a5.adc(Limb::ZERO, carry); + let (a6, carry) = a6.mac(a3, modulus[3], carry); + let (a7, a8) = a7.adc(carry2, carry); + + // Result may be within MODULUS of the correct value + let (result, _) = sub_inner( + [a4, a5, a6, a7, a8], + [modulus[0], modulus[1], modulus[2], modulus[3], Limb::ZERO], + ); + U256::new([result[0], result[1], result[2], result[3]]) +} + +#[inline] +#[allow(clippy::too_many_arguments)] +const fn sub_inner(l: [Limb; 5], r: [Limb; 5]) -> ([Limb; 4], Limb) { + /* + let (w0, borrow) = sbb(l[0], r[0], 0); + let (w1, borrow) = sbb(l[1], r[1], borrow); + let (w2, borrow) = sbb(l[2], r[2], borrow); + let (w3, borrow) = sbb(l[3], r[3], borrow); + let (_, borrow) = sbb(l[4], r[4], borrow); + */ + + let (w0, borrow) = l[0].sbb(r[0], Limb::ZERO); + let (w1, borrow) = l[1].sbb(r[1], borrow); + let (w2, borrow) = l[2].sbb(r[2], borrow); + let (w3, borrow) = l[3].sbb(r[3], borrow); + let (_, borrow) = l[4].sbb(r[4], borrow); + + // If underflow occurred on the final limb, borrow = 0xfff...fff, otherwise + // borrow = 0x000...000. Thus, we use it as a mask to conditionally add the + // modulus. 
+ + let modulus = MODULUS.0.as_limbs(); + + /* + let (w0, carry) = adc(w0, modulus[0] & borrow, 0); + let (w1, carry) = adc(w1, modulus[1] & borrow, carry); + let (w2, carry) = adc(w2, modulus[2] & borrow, carry); + let (w3, _) = adc(w3, modulus[3] & borrow, carry); + */ + + let (w0, carry) = w0.adc(modulus[0].bitand(borrow), Limb::ZERO); + let (w1, carry) = w1.adc(modulus[1].bitand(borrow), carry); + let (w2, carry) = w2.adc(modulus[2].bitand(borrow), carry); + let (w3, _) = w3.adc(modulus[3].bitand(borrow), carry); + + ([w0, w1, w2, w3], borrow) +} diff --git a/p256/src/arithmetic/scalar.rs b/p256/src/arithmetic/scalar.rs index 32cac621..f7de1a2f 100644 --- a/p256/src/arithmetic/scalar.rs +++ b/p256/src/arithmetic/scalar.rs @@ -38,17 +38,6 @@ pub(crate) const MODULUS: U256 = NistP256::ORDER; /// `MODULUS / 2` const FRAC_MODULUS_2: Scalar = Scalar(MODULUS.shr_vartime(1)); -/// MU = floor(2^512 / n) -/// = 115792089264276142090721624801893421302707618245269942344307673200490803338238 -/// = 0x100000000fffffffffffffffeffffffff43190552df1a6c21012ffd85eedf9bfe -pub const MU: [u64; 5] = [ - 0x012f_fd85_eedf_9bfe, - 0x4319_0552_df1a_6c21, - 0xffff_fffe_ffff_ffff, - 0x0000_0000_ffff_ffff, - 0x0000_0000_0000_0001, -]; - /// Scalars are elements in the finite field modulo n. /// /// # Trait impls diff --git a/p256/src/arithmetic/scalar/scalar32.rs b/p256/src/arithmetic/scalar/scalar32.rs index dfa5742e..f13a5892 100644 --- a/p256/src/arithmetic/scalar/scalar32.rs +++ b/p256/src/arithmetic/scalar/scalar32.rs @@ -1,12 +1,22 @@ //! 32-bit secp256r1 scalar field algorithms. 
-// TODO(tarcieri): adapt 64-bit arithmetic to proper 32-bit arithmetic +use super::MODULUS; +use elliptic_curve::bigint::{Limb, U256}; -use super::{MODULUS, MU}; -use crate::{ - arithmetic::util::{adc, mac, sbb}, - U256, -}; +/// MU = floor(2^512 / n) +/// = 115792089264276142090721624801893421302707618245269942344307673200490803338238 +/// = 0x100000000fffffffffffffffeffffffff43190552df1a6c21012ffd85eedf9bfe +const MU: [Limb; 9] = [ + Limb::from_u32(0xeedf_9bfe), + Limb::from_u32(0x012f_fd85), + Limb::from_u32(0xdf1a_6c21), + Limb::from_u32(0x4319_0552), + Limb::from_u32(0xffff_ffff), + Limb::from_u32(0xffff_fffe), + Limb::from_u32(0xffff_ffff), + Limb::from_u32(0x0000_0000), + Limb::from_u32(0x0000_0001), +]; /// Barrett Reduction /// @@ -39,150 +49,278 @@ use crate::{ #[inline] #[allow(clippy::too_many_arguments)] pub(super) const fn barrett_reduce(lo: U256, hi: U256) -> U256 { - let lo = u256_to_u64x4(lo); - let hi = u256_to_u64x4(hi); + let lo = lo.as_limbs(); + let hi = hi.as_limbs(); + let a0 = lo[0]; let a1 = lo[1]; let a2 = lo[2]; let a3 = lo[3]; - let a4 = hi[0]; - let a5 = hi[1]; - let a6 = hi[2]; - let a7 = hi[3]; - let q1: [u64; 5] = [a3, a4, a5, a6, a7]; - let q3 = q1_times_mu_shift_five(&q1); + let a4 = lo[4]; + let a5 = lo[5]; + let a6 = lo[6]; + let a7 = lo[7]; + let a8 = hi[0]; + let a9 = hi[1]; + let a10 = hi[2]; + let a11 = hi[3]; + let a12 = hi[4]; + let a13 = hi[5]; + let a14 = hi[6]; + let a15 = hi[7]; + + let q1: [Limb; 9] = [a7, a8, a9, a10, a11, a12, a13, a14, a15]; + let q3: [Limb; 9] = q1_times_mu_shift_nine(&q1); - let r1: [u64; 5] = [a0, a1, a2, a3, a4]; - let r2: [u64; 5] = q3_times_n_keep_five(&q3); - let r: [u64; 5] = sub_inner_five(r1, r2); + let r1: [Limb; 9] = [a0, a1, a2, a3, a4, a5, a6, a7, a8]; + let r2: [Limb; 9] = q3_times_n_keep_nine(&q3); + let r: [Limb; 9] = sub_inner_nine(r1, r2); // Result is in range (0, 3*n - 1), // and 90% of the time, no subtraction will be needed. 
- let r = subtract_n_if_necessary(r[0], r[1], r[2], r[3], r[4]); - let r = subtract_n_if_necessary(r[0], r[1], r[2], r[3], r[4]); - - U256::from_words([ - (r[0] & 0xFFFFFFFF) as u32, - (r[0] >> 32) as u32, - (r[1] & 0xFFFFFFFF) as u32, - (r[1] >> 32) as u32, - (r[2] & 0xFFFFFFFF) as u32, - (r[2] >> 32) as u32, - (r[3] & 0xFFFFFFFF) as u32, - (r[3] >> 32) as u32, - ]) + let r = subtract_n_if_necessary(r); + let r = subtract_n_if_necessary(r); + + U256::new([r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7]]) } -const fn q1_times_mu_shift_five(q1: &[u64; 5]) -> [u64; 5] { - // Schoolbook multiplication. - - let (_w0, carry) = mac(0, q1[0], MU[0], 0); - let (w1, carry) = mac(0, q1[0], MU[1], carry); - let (w2, carry) = mac(0, q1[0], MU[2], carry); - let (w3, carry) = mac(0, q1[0], MU[3], carry); - let (w4, w5) = mac(0, q1[0], MU[4], carry); - - let (_w1, carry) = mac(w1, q1[1], MU[0], 0); - let (w2, carry) = mac(w2, q1[1], MU[1], carry); - let (w3, carry) = mac(w3, q1[1], MU[2], carry); - let (w4, carry) = mac(w4, q1[1], MU[3], carry); - let (w5, w6) = mac(w5, q1[1], MU[4], carry); - - let (_w2, carry) = mac(w2, q1[2], MU[0], 0); - let (w3, carry) = mac(w3, q1[2], MU[1], carry); - let (w4, carry) = mac(w4, q1[2], MU[2], carry); - let (w5, carry) = mac(w5, q1[2], MU[3], carry); - let (w6, w7) = mac(w6, q1[2], MU[4], carry); - - let (_w3, carry) = mac(w3, q1[3], MU[0], 0); - let (w4, carry) = mac(w4, q1[3], MU[1], carry); - let (w5, carry) = mac(w5, q1[3], MU[2], carry); - let (w6, carry) = mac(w6, q1[3], MU[3], carry); - let (w7, w8) = mac(w7, q1[3], MU[4], carry); - - let (_w4, carry) = mac(w4, q1[4], MU[0], 0); - let (w5, carry) = mac(w5, q1[4], MU[1], carry); - let (w6, carry) = mac(w6, q1[4], MU[2], carry); - let (w7, carry) = mac(w7, q1[4], MU[3], carry); - let (w8, w9) = mac(w8, q1[4], MU[4], carry); - - // let q2 = [_w0, _w1, _w2, _w3, _w4, w5, w6, w7, w8, w9]; - [w5, w6, w7, w8, w9] +const fn q1_times_mu_shift_nine(q1: &[Limb; 9]) -> [Limb; 9] { + // Schoolbook 
multiplication + + let (_w0, carry) = Limb::ZERO.mac(q1[0], MU[0], Limb::ZERO); + let (w1, carry) = Limb::ZERO.mac(q1[0], MU[1], carry); + let (w2, carry) = Limb::ZERO.mac(q1[0], MU[2], carry); + let (w3, carry) = Limb::ZERO.mac(q1[0], MU[3], carry); + let (w4, carry) = Limb::ZERO.mac(q1[0], MU[4], carry); + let (w5, carry) = Limb::ZERO.mac(q1[0], MU[5], carry); + let (w6, carry) = Limb::ZERO.mac(q1[0], MU[6], carry); + // NOTE MU[7] == 0 + // let (w7, carry) = Limb::ZERO.mac(q1[0], MU[7], carry); + let (w7, _carry) = (carry, Limb::ZERO); + // NOTE MU[8] == 1 + // let (w8, w9) = Limb::ZERO.mac(q1[0], MU[8], carry); + let (w8, w9) = (q1[0], Limb::ZERO); + + let (_w1, carry) = w1.mac(q1[1], MU[0], Limb::ZERO); + let (w2, carry) = w2.mac(q1[1], MU[1], carry); + let (w3, carry) = w3.mac(q1[1], MU[2], carry); + let (w4, carry) = w4.mac(q1[1], MU[3], carry); + let (w5, carry) = w5.mac(q1[1], MU[4], carry); + let (w6, carry) = w6.mac(q1[1], MU[5], carry); + let (w7, carry) = w7.mac(q1[1], MU[6], carry); + // NOTE MU[7] == 0 + // let (w8, carry) = w8.mac(q1[1], MU[7], carry); + let (w8, carry) = w8.adc(Limb::ZERO, carry); + // NOTE MU[8] == 1 + // let (w9, w10) = w9.mac(q1[1], MU[8], carry); + let (w9, w10) = w9.adc(q1[1], carry); + + let (_w2, carry) = w2.mac(q1[2], MU[0], Limb::ZERO); + let (w3, carry) = w3.mac(q1[2], MU[1], carry); + let (w4, carry) = w4.mac(q1[2], MU[2], carry); + let (w5, carry) = w5.mac(q1[2], MU[3], carry); + let (w6, carry) = w6.mac(q1[2], MU[4], carry); + let (w7, carry) = w7.mac(q1[2], MU[5], carry); + let (w8, carry) = w8.mac(q1[2], MU[6], carry); + // let (w9, carry) = w9.mac(q1[2], MU[7], carry); + let (w9, carry) = w9.adc(Limb::ZERO, carry); + // let (w10, w11) = w10.mac(q1[2], MU[8], carry); + let (w10, w11) = w10.adc(q1[2], carry); + + let (_w3, carry) = w3.mac(q1[3], MU[0], Limb::ZERO); + let (w4, carry) = w4.mac(q1[3], MU[1], carry); + let (w5, carry) = w5.mac(q1[3], MU[2], carry); + let (w6, carry) = w6.mac(q1[3], MU[3], carry); + let 
(w7, carry) = w7.mac(q1[3], MU[4], carry); + let (w8, carry) = w8.mac(q1[3], MU[5], carry); + let (w9, carry) = w9.mac(q1[3], MU[6], carry); + // let (w10, carry) = w10.mac(q1[3], MU[7], carry); + let (w10, carry) = w10.adc(Limb::ZERO, carry); + // let (w11, w12) = w11.mac(q1[3], MU[8], carry); + let (w11, w12) = w11.adc(q1[3], carry); + + let (_w4, carry) = w4.mac(q1[4], MU[0], Limb::ZERO); + let (w5, carry) = w5.mac(q1[4], MU[1], carry); + let (w6, carry) = w6.mac(q1[4], MU[2], carry); + let (w7, carry) = w7.mac(q1[4], MU[3], carry); + let (w8, carry) = w8.mac(q1[4], MU[4], carry); + let (w9, carry) = w9.mac(q1[4], MU[5], carry); + let (w10, carry) = w10.mac(q1[4], MU[6], carry); + // let (w11, carry) = w11.mac(q1[4], MU[7], carry); + let (w11, carry) = w11.adc(Limb::ZERO, carry); + // let (w12, w13) = w12.mac(q1[4], MU[8], carry); + let (w12, w13) = w12.adc(q1[4], carry); + + let (_w5, carry) = w5.mac(q1[5], MU[0], Limb::ZERO); + let (w6, carry) = w6.mac(q1[5], MU[1], carry); + let (w7, carry) = w7.mac(q1[5], MU[2], carry); + let (w8, carry) = w8.mac(q1[5], MU[3], carry); + let (w9, carry) = w9.mac(q1[5], MU[4], carry); + let (w10, carry) = w10.mac(q1[5], MU[5], carry); + let (w11, carry) = w11.mac(q1[5], MU[6], carry); + // let (w12, carry) = w12.mac(q1[5], MU[7], carry); + let (w12, carry) = w12.adc(Limb::ZERO, carry); + // let (w13, w14) = w13.mac(q1[5], MU[8], carry); + let (w13, w14) = w13.adc(q1[5], carry); + + let (_w6, carry) = w6.mac(q1[6], MU[0], Limb::ZERO); + let (w7, carry) = w7.mac(q1[6], MU[1], carry); + let (w8, carry) = w8.mac(q1[6], MU[2], carry); + let (w9, carry) = w9.mac(q1[6], MU[3], carry); + let (w10, carry) = w10.mac(q1[6], MU[4], carry); + let (w11, carry) = w11.mac(q1[6], MU[5], carry); + let (w12, carry) = w12.mac(q1[6], MU[6], carry); + // let (w13, carry) = w13.mac(q1[6], MU[7], carry); + let (w13, carry) = w13.adc(Limb::ZERO, carry); + // let (w14, w15) = w14.mac(q1[6], MU[8], carry); + let (w14, w15) = w14.adc(q1[6], carry); + + 
let (_w7, carry) = w7.mac(q1[7], MU[0], Limb::ZERO); + let (w8, carry) = w8.mac(q1[7], MU[1], carry); + let (w9, carry) = w9.mac(q1[7], MU[2], carry); + let (w10, carry) = w10.mac(q1[7], MU[3], carry); + let (w11, carry) = w11.mac(q1[7], MU[4], carry); + let (w12, carry) = w12.mac(q1[7], MU[5], carry); + let (w13, carry) = w13.mac(q1[7], MU[6], carry); + // let (w14, carry) = w14.mac(q1[7], MU[7], carry); + let (w14, carry) = w14.adc(Limb::ZERO, carry); + // let (w15, w16) = w15.mac(q1[7], MU[8], carry); + let (w15, w16) = w15.adc(q1[7], carry); + + let (_w8, carry) = w8.mac(q1[8], MU[0], Limb::ZERO); + let (w9, carry) = w9.mac(q1[8], MU[1], carry); + let (w10, carry) = w10.mac(q1[8], MU[2], carry); + let (w11, carry) = w11.mac(q1[8], MU[3], carry); + let (w12, carry) = w12.mac(q1[8], MU[4], carry); + let (w13, carry) = w13.mac(q1[8], MU[5], carry); + let (w14, carry) = w14.mac(q1[8], MU[6], carry); + // let (w15, carry) = w15.mac(q1[8], MU[7], carry); + let (w15, carry) = w15.adc(Limb::ZERO, carry); + // let (w16, w17) = w16.mac(q1[8], MU[8], carry); + let (w16, w17) = w16.adc(q1[8], carry); + + // let q2 = [_w0, _w1, _w2, _w3, _w4, _w5, _w6, _w7, _w8, w9, w10, w11, w12, w13, w14, w15, w16, w17]; + [w9, w10, w11, w12, w13, w14, w15, w16, w17] } -const fn q3_times_n_keep_five(q3: &[u64; 5]) -> [u64; 5] { - // Schoolbook multiplication. 
+const fn q3_times_n_keep_nine(q3: &[Limb; 9]) -> [Limb; 9] { + // Schoolbook multiplication + + let modulus = MODULUS.as_limbs(); + + /* NOTE + * modulus[7] = 2^32 - 1 + * modulus[6] = 0 + * modulus[5] = 2^32 - 1 + * modulus[4] = 2^32 - 1 + */ + + let (w0, carry) = Limb::ZERO.mac(q3[0], modulus[0], Limb::ZERO); + let (w1, carry) = Limb::ZERO.mac(q3[0], modulus[1], carry); + let (w2, carry) = Limb::ZERO.mac(q3[0], modulus[2], carry); + let (w3, carry) = Limb::ZERO.mac(q3[0], modulus[3], carry); + let (w4, carry) = Limb::ZERO.mac(q3[0], modulus[4], carry); + let (w5, carry) = Limb::ZERO.mac(q3[0], modulus[5], carry); + // NOTE modulus[6] = 0 + // let (w6, carry) = Limb::ZERO.mac(q3[0], modulus[6], carry); + let (w6, carry) = (carry, Limb::ZERO); + let (w7, carry) = Limb::ZERO.mac(q3[0], modulus[7], carry); + // let (w8, _) = Limb::ZERO.mac(q3[0], Limb::ZERO, carry); + let (w8, _) = (carry, Limb::ZERO); + + let (w1, carry) = w1.mac(q3[1], modulus[0], Limb::ZERO); + let (w2, carry) = w2.mac(q3[1], modulus[1], carry); + let (w3, carry) = w3.mac(q3[1], modulus[2], carry); + let (w4, carry) = w4.mac(q3[1], modulus[3], carry); + let (w5, carry) = w5.mac(q3[1], modulus[4], carry); + let (w6, carry) = w6.mac(q3[1], modulus[5], carry); + // let (w7, carry) = w7.mac(q3[1], modulus[6], carry); + let (w7, carry) = w7.adc(Limb::ZERO, carry); + let (w8, _) = w8.mac(q3[1], modulus[7], carry); + + let (w2, carry) = w2.mac(q3[2], modulus[0], Limb::ZERO); + let (w3, carry) = w3.mac(q3[2], modulus[1], carry); + let (w4, carry) = w4.mac(q3[2], modulus[2], carry); + let (w5, carry) = w5.mac(q3[2], modulus[3], carry); + let (w6, carry) = w6.mac(q3[2], modulus[4], carry); + let (w7, carry) = w7.mac(q3[2], modulus[5], carry); + // let (w8, _) = w8.mac(q3[2], modulus[6], carry); + let (w8, _) = w8.adc(Limb::ZERO, carry); - let modulus = u256_to_u64x4(MODULUS); + let (w3, carry) = w3.mac(q3[3], modulus[0], Limb::ZERO); + let (w4, carry) = w4.mac(q3[3], modulus[1], carry); + let (w5, carry) = 
w5.mac(q3[3], modulus[2], carry); + let (w6, carry) = w6.mac(q3[3], modulus[3], carry); + let (w7, carry) = w7.mac(q3[3], modulus[4], carry); + let (w8, _) = w8.mac(q3[3], modulus[5], carry); - let (w0, carry) = mac(0, q3[0], modulus[0], 0); - let (w1, carry) = mac(0, q3[0], modulus[1], carry); - let (w2, carry) = mac(0, q3[0], modulus[2], carry); - let (w3, carry) = mac(0, q3[0], modulus[3], carry); - let (w4, _) = mac(0, q3[0], 0, carry); + let (w4, carry) = w4.mac(q3[4], modulus[0], Limb::ZERO); + let (w5, carry) = w5.mac(q3[4], modulus[1], carry); + let (w6, carry) = w6.mac(q3[4], modulus[2], carry); + let (w7, carry) = w7.mac(q3[4], modulus[3], carry); + let (w8, _) = w8.mac(q3[4], modulus[4], carry); - let (w1, carry) = mac(w1, q3[1], modulus[0], 0); - let (w2, carry) = mac(w2, q3[1], modulus[1], carry); - let (w3, carry) = mac(w3, q3[1], modulus[2], carry); - let (w4, _) = mac(w4, q3[1], modulus[3], carry); + let (w5, carry) = w5.mac(q3[5], modulus[0], Limb::ZERO); + let (w6, carry) = w6.mac(q3[5], modulus[1], carry); + let (w7, carry) = w7.mac(q3[5], modulus[2], carry); + let (w8, _) = w8.mac(q3[5], modulus[3], carry); - let (w2, carry) = mac(w2, q3[2], modulus[0], 0); - let (w3, carry) = mac(w3, q3[2], modulus[1], carry); - let (w4, _) = mac(w4, q3[2], modulus[2], carry); + let (w6, carry) = w6.mac(q3[6], modulus[0], Limb::ZERO); + let (w7, carry) = w7.mac(q3[6], modulus[1], carry); + let (w8, _) = w8.mac(q3[6], modulus[2], carry); - let (w3, carry) = mac(w3, q3[3], modulus[0], 0); - let (w4, _) = mac(w4, q3[3], modulus[1], carry); + let (w7, carry) = w7.mac(q3[7], modulus[0], Limb::ZERO); + let (w8, _) = w8.mac(q3[7], modulus[1], carry); - let (w4, _) = mac(w4, q3[4], modulus[0], 0); + let (w8, _) = w8.mac(q3[8], modulus[0], Limb::ZERO); - [w0, w1, w2, w3, w4] + [w0, w1, w2, w3, w4, w5, w6, w7, w8] } #[inline] #[allow(clippy::too_many_arguments)] -const fn sub_inner_five(l: [u64; 5], r: [u64; 5]) -> [u64; 5] { - let (w0, borrow) = sbb(l[0], r[0], 0); - 
let (w1, borrow) = sbb(l[1], r[1], borrow); - let (w2, borrow) = sbb(l[2], r[2], borrow); - let (w3, borrow) = sbb(l[3], r[3], borrow); - let (w4, _borrow) = sbb(l[4], r[4], borrow); - - // If underflow occurred on the final limb - don't care (= add b^{k+1}). - [w0, w1, w2, w3, w4] +const fn sub_inner_nine(l: [Limb; 9], r: [Limb; 9]) -> [Limb; 9] { + let (w0, borrow) = l[0].sbb(r[0], Limb::ZERO); + let (w1, borrow) = l[1].sbb(r[1], borrow); + let (w2, borrow) = l[2].sbb(r[2], borrow); + let (w3, borrow) = l[3].sbb(r[3], borrow); + let (w4, borrow) = l[4].sbb(r[4], borrow); + let (w5, borrow) = l[5].sbb(r[5], borrow); + let (w6, borrow) = l[6].sbb(r[6], borrow); + let (w7, borrow) = l[7].sbb(r[7], borrow); + let (w8, _borrow) = l[8].sbb(r[8], borrow); + + // If underflow occurred in the final limb - don't care (= add b^{k+1}). + [w0, w1, w2, w3, w4, w5, w6, w7, w8] } #[inline] #[allow(clippy::too_many_arguments)] -const fn subtract_n_if_necessary(r0: u64, r1: u64, r2: u64, r3: u64, r4: u64) -> [u64; 5] { - let modulus = u256_to_u64x4(MODULUS); - - let (w0, borrow) = sbb(r0, modulus[0], 0); - let (w1, borrow) = sbb(r1, modulus[1], borrow); - let (w2, borrow) = sbb(r2, modulus[2], borrow); - let (w3, borrow) = sbb(r3, modulus[3], borrow); - let (w4, borrow) = sbb(r4, 0, borrow); - - // If underflow occurred on the final limb, borrow = 0xfff...fff, otherwise - // borrow = 0x000...000. Thus, we use it as a mask to conditionally add the - // modulus. 
- let (w0, carry) = adc(w0, modulus[0] & borrow, 0); - let (w1, carry) = adc(w1, modulus[1] & borrow, carry); - let (w2, carry) = adc(w2, modulus[2] & borrow, carry); - let (w3, carry) = adc(w3, modulus[3] & borrow, carry); - let (w4, _carry) = adc(w4, 0, carry); - - [w0, w1, w2, w3, w4] -} +const fn subtract_n_if_necessary(r: [Limb; 9]) -> [Limb; 9] { + let modulus = MODULUS.as_limbs(); -// TODO(tarcieri): replace this with proper 32-bit arithmetic -#[inline] -const fn u256_to_u64x4(u256: U256) -> [u64; 4] { - let words = u256.as_words(); - - [ - (words[0] as u64) | ((words[1] as u64) << 32), - (words[2] as u64) | ((words[3] as u64) << 32), - (words[4] as u64) | ((words[5] as u64) << 32), - (words[6] as u64) | ((words[7] as u64) << 32), - ] + let (w0, borrow) = r[0].sbb(modulus[0], Limb::ZERO); + let (w1, borrow) = r[1].sbb(modulus[1], borrow); + let (w2, borrow) = r[2].sbb(modulus[2], borrow); + let (w3, borrow) = r[3].sbb(modulus[3], borrow); + let (w4, borrow) = r[4].sbb(modulus[4], borrow); + let (w5, borrow) = r[5].sbb(modulus[5], borrow); + let (w6, borrow) = r[6].sbb(modulus[6], borrow); + let (w7, borrow) = r[7].sbb(modulus[7], borrow); + let (w8, borrow) = r[8].sbb(Limb::ZERO, borrow); + + // If underflow occurred in the final limb, borrow = 0xfff...fff, otherwise + // borrow = 0x000...000. Thus, we use it as a mask to conditionally add + // the modulus. 
+ let (w0, carry) = w0.adc(modulus[0].bitand(borrow), Limb::ZERO); + let (w1, carry) = w1.adc(modulus[1].bitand(borrow), carry); + let (w2, carry) = w2.adc(modulus[2].bitand(borrow), carry); + let (w3, carry) = w3.adc(modulus[3].bitand(borrow), carry); + let (w4, carry) = w4.adc(modulus[4].bitand(borrow), carry); + let (w5, carry) = w5.adc(modulus[5].bitand(borrow), carry); + let (w6, carry) = w6.adc(modulus[6].bitand(borrow), carry); + let (w7, carry) = w7.adc(modulus[7].bitand(borrow), carry); + let (w8, _carry) = w8.adc(Limb::ZERO, carry); + + [w0, w1, w2, w3, w4, w5, w6, w7, w8] } diff --git a/p256/src/arithmetic/scalar/scalar64.rs b/p256/src/arithmetic/scalar/scalar64.rs index e15711bc..2bad5612 100644 --- a/p256/src/arithmetic/scalar/scalar64.rs +++ b/p256/src/arithmetic/scalar/scalar64.rs @@ -1,10 +1,18 @@ //! 64-bit secp256r1 scalar field algorithms. -use super::{MODULUS, MU}; -use crate::{ - arithmetic::util::{adc, mac, sbb}, - U256, -}; +use super::MODULUS; +use elliptic_curve::bigint::{Limb, U256}; + +/// MU = floor(2^512 / n) +/// = 115792089264276142090721624801893421302707618245269942344307673200490803338238 +/// = 0x100000000fffffffffffffffeffffffff43190552df1a6c21012ffd85eedf9bfe +const MU: [Limb; 5] = [ + Limb::from_u64(0x012f_fd85_eedf_9bfe), + Limb::from_u64(0x4319_0552_df1a_6c21), + Limb::from_u64(0xffff_fffe_ffff_ffff), + Limb::from_u64(0x0000_0000_ffff_ffff), + Limb::from_u64(0x0000_0000_0000_0001), +]; /// Barrett Reduction /// @@ -37,8 +45,8 @@ use crate::{ #[inline] #[allow(clippy::too_many_arguments)] pub(super) const fn barrett_reduce(lo: U256, hi: U256) -> U256 { - let lo = lo.as_words(); - let hi = hi.as_words(); + let lo = lo.as_limbs(); + let hi = hi.as_limbs(); let a0 = lo[0]; let a1 = lo[1]; let a2 = lo[2]; @@ -47,93 +55,100 @@ pub(super) const fn barrett_reduce(lo: U256, hi: U256) -> U256 { let a5 = hi[1]; let a6 = hi[2]; let a7 = hi[3]; - let q1: [u64; 5] = [a3, a4, a5, a6, a7]; + let q1 = [a3, a4, a5, a6, a7]; let q3 = 
q1_times_mu_shift_five(&q1); - let r1: [u64; 5] = [a0, a1, a2, a3, a4]; - let r2: [u64; 5] = q3_times_n_keep_five(&q3); - let r: [u64; 5] = sub_inner_five(r1, r2); + let r1 = [a0, a1, a2, a3, a4]; + let r2 = q3_times_n_keep_five(&q3); + let r = sub_inner_five(r1, r2); // Result is in range (0, 3*n - 1), // and 90% of the time, no subtraction will be needed. - let r = subtract_n_if_necessary(r[0], r[1], r[2], r[3], r[4]); - let r = subtract_n_if_necessary(r[0], r[1], r[2], r[3], r[4]); - U256::from_words([r[0], r[1], r[2], r[3]]) + let r = subtract_n_if_necessary(r); + let r = subtract_n_if_necessary(r); + U256::new([r[0], r[1], r[2], r[3]]) } -const fn q1_times_mu_shift_five(q1: &[u64; 5]) -> [u64; 5] { - // Schoolbook multiplication. - - let (_w0, carry) = mac(0, q1[0], MU[0], 0); - let (w1, carry) = mac(0, q1[0], MU[1], carry); - let (w2, carry) = mac(0, q1[0], MU[2], carry); - let (w3, carry) = mac(0, q1[0], MU[3], carry); - let (w4, w5) = mac(0, q1[0], MU[4], carry); - - let (_w1, carry) = mac(w1, q1[1], MU[0], 0); - let (w2, carry) = mac(w2, q1[1], MU[1], carry); - let (w3, carry) = mac(w3, q1[1], MU[2], carry); - let (w4, carry) = mac(w4, q1[1], MU[3], carry); - let (w5, w6) = mac(w5, q1[1], MU[4], carry); - - let (_w2, carry) = mac(w2, q1[2], MU[0], 0); - let (w3, carry) = mac(w3, q1[2], MU[1], carry); - let (w4, carry) = mac(w4, q1[2], MU[2], carry); - let (w5, carry) = mac(w5, q1[2], MU[3], carry); - let (w6, w7) = mac(w6, q1[2], MU[4], carry); - - let (_w3, carry) = mac(w3, q1[3], MU[0], 0); - let (w4, carry) = mac(w4, q1[3], MU[1], carry); - let (w5, carry) = mac(w5, q1[3], MU[2], carry); - let (w6, carry) = mac(w6, q1[3], MU[3], carry); - let (w7, w8) = mac(w7, q1[3], MU[4], carry); - - let (_w4, carry) = mac(w4, q1[4], MU[0], 0); - let (w5, carry) = mac(w5, q1[4], MU[1], carry); - let (w6, carry) = mac(w6, q1[4], MU[2], carry); - let (w7, carry) = mac(w7, q1[4], MU[3], carry); - let (w8, w9) = mac(w8, q1[4], MU[4], carry); +const fn 
q1_times_mu_shift_five(q1: &[Limb; 5]) -> [Limb; 5] { + // Schoolbook multiplication + + let (_w0, carry) = Limb::ZERO.mac(q1[0], MU[0], Limb::ZERO); + let (w1, carry) = Limb::ZERO.mac(q1[0], MU[1], carry); + let (w2, carry) = Limb::ZERO.mac(q1[0], MU[2], carry); + let (w3, carry) = Limb::ZERO.mac(q1[0], MU[3], carry); + // NOTE MU[4] == 1 + // let (w4, w5) = Limb::ZERO.mac(q1[0], MU[4], carry); + let (w4, w5) = Limb::ZERO.adc(q1[0], carry); + + let (_w1, carry) = w1.mac(q1[1], MU[0], Limb::ZERO); + let (w2, carry) = w2.mac(q1[1], MU[1], carry); + let (w3, carry) = w3.mac(q1[1], MU[2], carry); + let (w4, carry) = w4.mac(q1[1], MU[3], carry); + // let (w5, w6) = w5.mac(q1[1], MU[4], carry); + let (w5, w6) = w5.adc(q1[1], carry); + + let (_w2, carry) = w2.mac(q1[2], MU[0], Limb::ZERO); + let (w3, carry) = w3.mac(q1[2], MU[1], carry); + let (w4, carry) = w4.mac(q1[2], MU[2], carry); + let (w5, carry) = w5.mac(q1[2], MU[3], carry); + // let (w6, w7) = w6.mac(q1[2], MU[4], carry); + let (w6, w7) = w6.adc(q1[2], carry); + + let (_w3, carry) = w3.mac(q1[3], MU[0], Limb::ZERO); + let (w4, carry) = w4.mac(q1[3], MU[1], carry); + let (w5, carry) = w5.mac(q1[3], MU[2], carry); + let (w6, carry) = w6.mac(q1[3], MU[3], carry); + // let (w7, w8) = w7.mac(q1[3], MU[4], carry); + let (w7, w8) = w7.adc(q1[3], carry); + + let (_w4, carry) = w4.mac(q1[4], MU[0], Limb::ZERO); + let (w5, carry) = w5.mac(q1[4], MU[1], carry); + let (w6, carry) = w6.mac(q1[4], MU[2], carry); + let (w7, carry) = w7.mac(q1[4], MU[3], carry); + // let (w8, w9) = w8.mac(q1[4], MU[4], carry); + let (w8, w9) = w8.adc(q1[4], carry); // let q2 = [_w0, _w1, _w2, _w3, _w4, w5, w6, w7, w8, w9]; [w5, w6, w7, w8, w9] } -const fn q3_times_n_keep_five(q3: &[u64; 5]) -> [u64; 5] { +const fn q3_times_n_keep_five(q3: &[Limb; 5]) -> [Limb; 5] { // Schoolbook multiplication. 
- let modulus = MODULUS.as_words(); + let modulus = MODULUS.as_limbs(); - let (w0, carry) = mac(0, q3[0], modulus[0], 0); - let (w1, carry) = mac(0, q3[0], modulus[1], carry); - let (w2, carry) = mac(0, q3[0], modulus[2], carry); - let (w3, carry) = mac(0, q3[0], modulus[3], carry); - let (w4, _) = mac(0, q3[0], 0, carry); + let (w0, carry) = Limb::ZERO.mac(q3[0], modulus[0], Limb::ZERO); + let (w1, carry) = Limb::ZERO.mac(q3[0], modulus[1], carry); + let (w2, carry) = Limb::ZERO.mac(q3[0], modulus[2], carry); + let (w3, carry) = Limb::ZERO.mac(q3[0], modulus[3], carry); + // let (w4, _) = Limb::ZERO.mac(q3[0], Limb::ZERO, carry); + let (w4, _) = (carry, Limb::ZERO); - let (w1, carry) = mac(w1, q3[1], modulus[0], 0); - let (w2, carry) = mac(w2, q3[1], modulus[1], carry); - let (w3, carry) = mac(w3, q3[1], modulus[2], carry); - let (w4, _) = mac(w4, q3[1], modulus[3], carry); + let (w1, carry) = w1.mac(q3[1], modulus[0], Limb::ZERO); + let (w2, carry) = w2.mac(q3[1], modulus[1], carry); + let (w3, carry) = w3.mac(q3[1], modulus[2], carry); + let (w4, _) = w4.mac(q3[1], modulus[3], carry); - let (w2, carry) = mac(w2, q3[2], modulus[0], 0); - let (w3, carry) = mac(w3, q3[2], modulus[1], carry); - let (w4, _) = mac(w4, q3[2], modulus[2], carry); + let (w2, carry) = w2.mac(q3[2], modulus[0], Limb::ZERO); + let (w3, carry) = w3.mac(q3[2], modulus[1], carry); + let (w4, _) = w4.mac(q3[2], modulus[2], carry); - let (w3, carry) = mac(w3, q3[3], modulus[0], 0); - let (w4, _) = mac(w4, q3[3], modulus[1], carry); + let (w3, carry) = w3.mac(q3[3], modulus[0], Limb::ZERO); + let (w4, _) = w4.mac(q3[3], modulus[1], carry); - let (w4, _) = mac(w4, q3[4], modulus[0], 0); + let (w4, _) = w4.mac(q3[4], modulus[0], Limb::ZERO); [w0, w1, w2, w3, w4] } #[inline] #[allow(clippy::too_many_arguments)] -const fn sub_inner_five(l: [u64; 5], r: [u64; 5]) -> [u64; 5] { - let (w0, borrow) = sbb(l[0], r[0], 0); - let (w1, borrow) = sbb(l[1], r[1], borrow); - let (w2, borrow) = sbb(l[2], r[2], borrow); - 
let (w3, borrow) = sbb(l[3], r[3], borrow); - let (w4, _borrow) = sbb(l[4], r[4], borrow); +const fn sub_inner_five(l: [Limb; 5], r: [Limb; 5]) -> [Limb; 5] { + let (w0, borrow) = l[0].sbb(r[0], Limb::ZERO); + let (w1, borrow) = l[1].sbb(r[1], borrow); + let (w2, borrow) = l[2].sbb(r[2], borrow); + let (w3, borrow) = l[3].sbb(r[3], borrow); + let (w4, _borrow) = l[4].sbb(r[4], borrow); // If underflow occurred on the final limb - don't care (= add b^{k+1}). [w0, w1, w2, w3, w4] @@ -141,23 +156,23 @@ const fn sub_inner_five(l: [u64; 5], r: [u64; 5]) -> [u64; 5] { #[inline] #[allow(clippy::too_many_arguments)] -const fn subtract_n_if_necessary(r0: u64, r1: u64, r2: u64, r3: u64, r4: u64) -> [u64; 5] { - let modulus = MODULUS.as_words(); +const fn subtract_n_if_necessary(r: [Limb; 5]) -> [Limb; 5] { + let modulus = MODULUS.as_limbs(); - let (w0, borrow) = sbb(r0, modulus[0], 0); - let (w1, borrow) = sbb(r1, modulus[1], borrow); - let (w2, borrow) = sbb(r2, modulus[2], borrow); - let (w3, borrow) = sbb(r3, modulus[3], borrow); - let (w4, borrow) = sbb(r4, 0, borrow); + let (w0, borrow) = r[0].sbb(modulus[0], Limb::ZERO); + let (w1, borrow) = r[1].sbb(modulus[1], borrow); + let (w2, borrow) = r[2].sbb(modulus[2], borrow); + let (w3, borrow) = r[3].sbb(modulus[3], borrow); + let (w4, borrow) = r[4].sbb(Limb::ZERO, borrow); // If underflow occurred on the final limb, borrow = 0xfff...fff, otherwise // borrow = 0x000...000. Thus, we use it as a mask to conditionally add the // modulus. 
- let (w0, carry) = adc(w0, modulus[0] & borrow, 0); - let (w1, carry) = adc(w1, modulus[1] & borrow, carry); - let (w2, carry) = adc(w2, modulus[2] & borrow, carry); - let (w3, carry) = adc(w3, modulus[3] & borrow, carry); - let (w4, _carry) = adc(w4, 0, carry); + let (w0, carry) = w0.adc(modulus[0].bitand(borrow), Limb::ZERO); + let (w1, carry) = w1.adc(modulus[1].bitand(borrow), carry); + let (w2, carry) = w2.adc(modulus[2].bitand(borrow), carry); + let (w3, carry) = w3.adc(modulus[3].bitand(borrow), carry); + let (w4, _carry) = w4.adc(Limb::ZERO, carry); [w0, w1, w2, w3, w4] } diff --git a/p256/src/arithmetic/util.rs b/p256/src/arithmetic/util.rs deleted file mode 100644 index 8ce5a9db..00000000 --- a/p256/src/arithmetic/util.rs +++ /dev/null @@ -1,72 +0,0 @@ -//! Helper functions. -// TODO(tarcieri): replace these with `crypto-bigint` - -use elliptic_curve::bigint::U256; - -/// Computes `a + b + carry`, returning the result along with the new carry. 64-bit version. -#[inline(always)] -pub(crate) const fn adc(a: u64, b: u64, carry: u64) -> (u64, u64) { - let ret = (a as u128) + (b as u128) + (carry as u128); - (ret as u64, (ret >> 64) as u64) -} - -/// Computes `a - (b + borrow)`, returning the result along with the new borrow. 64-bit version. -#[inline(always)] -pub(crate) const fn sbb(a: u64, b: u64, borrow: u64) -> (u64, u64) { - let ret = (a as u128).wrapping_sub((b as u128) + ((borrow >> 63) as u128)); - (ret as u64, (ret >> 64) as u64) -} - -/// Computes `a + (b * c) + carry`, returning the result along with the new carry. -#[inline(always)] -pub(crate) const fn mac(a: u64, b: u64, c: u64, carry: u64) -> (u64, u64) { - let ret = (a as u128) + ((b as u128) * (c as u128)) + (carry as u128); - (ret as u64, (ret >> 64) as u64) -} - -/// Array containing 4 x 64-bit unsigned integers. -// TODO(tarcieri): replace this entirely with `U256` -pub(crate) type U64x4 = [u64; 4]; - -/// Convert to a [`U64x4`] array. 
-// TODO(tarcieri): implement all algorithms in terms of `U256`? -#[cfg(target_pointer_width = "32")] -pub(crate) const fn u256_to_u64x4(u256: U256) -> U64x4 { - let limbs = u256.to_words(); - - [ - (limbs[0] as u64) | ((limbs[1] as u64) << 32), - (limbs[2] as u64) | ((limbs[3] as u64) << 32), - (limbs[4] as u64) | ((limbs[5] as u64) << 32), - (limbs[6] as u64) | ((limbs[7] as u64) << 32), - ] -} - -/// Convert to a [`U64x4`] array. -// TODO(tarcieri): implement all algorithms in terms of `U256`? -#[cfg(target_pointer_width = "64")] -pub(crate) const fn u256_to_u64x4(u256: U256) -> U64x4 { - u256.to_words() -} - -/// Convert from a [`U64x4`] array. -#[cfg(target_pointer_width = "32")] -pub(crate) const fn u64x4_to_u256(limbs: U64x4) -> U256 { - U256::from_words([ - (limbs[0] & 0xFFFFFFFF) as u32, - (limbs[0] >> 32) as u32, - (limbs[1] & 0xFFFFFFFF) as u32, - (limbs[1] >> 32) as u32, - (limbs[2] & 0xFFFFFFFF) as u32, - (limbs[2] >> 32) as u32, - (limbs[3] & 0xFFFFFFFF) as u32, - (limbs[3] >> 32) as u32, - ]) -} - -/// Convert from a [`U64x4`] array. -// TODO(tarcieri): implement all algorithms in terms of `U256`? -#[cfg(target_pointer_width = "64")] -pub(crate) const fn u64x4_to_u256(limbs: U64x4) -> U256 { - U256::from_words(limbs) -}