From de87739635ff278171ed0304912c314719cac820 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= Date: Fri, 12 Jun 2020 18:37:02 +0200 Subject: [PATCH 01/26] Implement types and uint128 primitives --- stint/private/conversion.nim | 59 ----- stint/private/datatypes.nim | 203 +++--------------- .../private/primitives/addcarry_subborrow.nim | 169 +++++++++++++++ .../private/primitives/extended_precision.nim | 130 +++++++++++ .../extended_precision_64bit_uint128.nim | 95 ++++++++ .../extended_precision_x86_64_gcc.nim | 57 +++++ .../extended_precision_x86_64_msvc.nim | 87 ++++++++ 7 files changed, 572 insertions(+), 228 deletions(-) delete mode 100644 stint/private/conversion.nim create mode 100644 stint/private/primitives/addcarry_subborrow.nim create mode 100644 stint/private/primitives/extended_precision.nim create mode 100644 stint/private/primitives/extended_precision_64bit_uint128.nim create mode 100644 stint/private/primitives/extended_precision_x86_64_gcc.nim create mode 100644 stint/private/primitives/extended_precision_x86_64_msvc.nim diff --git a/stint/private/conversion.nim b/stint/private/conversion.nim deleted file mode 100644 index 469e1b1..0000000 --- a/stint/private/conversion.nim +++ /dev/null @@ -1,59 +0,0 @@ -# Stint -# Copyright 2018 Status Research & Development GmbH -# Licensed under either of -# -# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) -# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) -# -# at your option. This file may not be copied, modified, or distributed except according to those terms. 
- -import ./datatypes - -func toSubtype*[T: SomeInteger](b: bool, _: typedesc[T]): T {.inline.}= - b.T - -func toSubtype*[T: UintImpl | IntImpl](b: bool, _: typedesc[T]): T {.inline.}= - type SubTy = type result.lo - result.lo = toSubtype(b, SubTy) - -func toUint*(n: UintImpl or IntImpl or SomeSignedInt): auto {.inline.}= - ## Casts an unsigned integer to an uint of the same size - # TODO: uint128 support - when n.sizeof > 8: - {.fatal: "Unreachable. You are trying to cast a StUint with more than 64-bit of precision" .} - elif n.sizeof == 8: - cast[uint64](n) - elif n.sizeof == 4: - cast[uint32](n) - elif n.sizeof == 2: - cast[uint16](n) - else: - cast[uint8](n) - -func toUint*(n: SomeUnsignedInt): SomeUnsignedInt {.inline.}= - ## No-op overload of multi-precision int casting - n - -func asDoubleUint*(n: UintImpl | SomeUnsignedInt): auto {.inline.} = - ## Convert an integer or StUint to an uint with double the size - type Double = ( - when n.sizeof == 4: uint64 - elif n.sizeof == 2: uint32 - else: uint16 - ) - - n.toUint.Double - -func toInt*(n: UintImpl or IntImpl or SomeInteger): auto {.inline.}= - ## Casts an unsigned integer to an uint of the same size - # TODO: uint128 support - when n.sizeof > 8: - {.fatal: "Unreachable. You are trying to cast a StUint with more than 64-bit of precision" .} - elif n.sizeof == 8: - cast[int64](n) - elif n.sizeof == 4: - cast[int32](n) - elif n.sizeof == 2: - cast[int16](n) - else: - cast[int8](n) diff --git a/stint/private/datatypes.nim b/stint/private/datatypes.nim index 867683b..46da4be 100644 --- a/stint/private/datatypes.nim +++ b/stint/private/datatypes.nim @@ -7,191 +7,56 @@ # # at your option. This file may not be copied, modified, or distributed except according to those terms. -# TODO: test if GCC/Clang support uint128 natively - -# #### Overview -# -# Stint extends the default uint8, uint16, uint32, uint64 with power of 2 integers. -# Only limitation is your stack size so you can have uint128, uint256, uint512 ... 
-# Signed int are also possible. -# -# As a high-level API, Stint adheres to Nim and C conventions and uses the same operators like: -# `+`, `xor`, `not` ... -# -# #### Implementation -# -# Stint types are stored on the stack and have a structure -# similar to a binary tree of power of two unsigned integers -# with "high" and "low" words: -# -# Stuint[256] -# hi: Stuint[128] lo: Stuint[128] -# hihi: uint64 hilo: uint64 lohi: uint64 lolo: uint64 -# -# This follows paper https://hal.archives-ouvertes.fr/hal-00582593v2 -# "Recursive double-size fixed precision arithmetic" from Jul. 2016 -# to implement an efficient fixed precision bigint for embedded devices, especially FPGAs. -# -# For testing purpose, the flag `-d:stint_test` can be passed at compile-time -# to switch the backend to uint32. -# In the future the default backend will become uint128 on supporting compilers. -# -# This has following benefits: -# - BigEndian/LittleEndian support is trivial. -# - Not having for loops help the compiler producing the most efficient instructions -# like ADC (Add with Carry) -# - Proving that the recursive structure works at depth 64 for uint32 backend means that -# it would work at depth 128 for uint64 backend. -# We can easily choose a uint16 or uint8 backend as well. -# - Due to the recursive structure, testing operations when there is: -# - no leaves(uint64) -# - a root and leaves with no nodes (uint128) -# - a root + intermediate nodes + leaves (uint256) -# should be enough to ensure they work at all sizes, edge cases included. -# - Adding a new backend like uint128 (GCC/Clang) or uint256 (LLVM instrinsics only) is just adding -# a new case in the `uintImpl` template. -# - All math implementations of the operations have a straightforward translation -# to a high-low structure, including the fastest Karatsuba multiplication -# and co-recursive division algorithm by Burnikel and Ziegler. 
-# This makes translating those algorithms into Nim easier compared to an array backend. -# It would also probably require less code and would be much easier to audit versus -# the math reference papers. -# - For implementation of algorithms, there is no issue to take subslices of the memory representation -# with a recursive tree structure. -# On the other side, returning a `var array[N div 2, uint64]` is problematic at the moment. -# - Compile-time computation is possible while due to the previous issue -# an array backend would be required to use var openArray[uint64] -# i.e. pointers. -# - Note that while shift-right and left can easily be done an array of bytes -# this would have reduced performance compared to moving 64-bit words. -# An efficient implementation on array of words would require checking the shift -# versus a half-word to deal with carry-in/out from and to the adjacent words -# similar to a recursive implementation. -# -# Iterations over the whole integers, for example for `==` is always unrolled. -# Due to being on the stack, any optimizing compiler should compile that to efficient memcmp -# -# When not to use Stint: -# -# 1. Constant-time arithmetics -# - Do not use Stint if you need side-channels resistance, -# This requires to avoid all branches (i.e. no booleans) -# 2. Arbitrary-precision with lots of small-values -# - If you need arbitrary precision but most of the time you store mall values -# you will waste a lot of memory unless you use an object variant of various Stint sizes. -# type MyUint = object -# case kind: int -# of 0..64: uint64 -# of 66..128: ref Stuint[128] -# of 129..256: ref Stuint[256] -# ... -# -# Note: if you work with huge size, you can allocate stints on the heap with -# for example `type HeapInt8192 = ref Stint[8192]. -# If you have a lot of computations and intermediate variables it's probably worthwhile -# to explore creating an object pool to reuse the memory buffers. 
- -template checkDiv2(bits: static[int]): untyped = - # TODO: There is no need to check if power of 2 at each uintImpl instantiation, it slows compilation. - # However we easily get into nested templates instantiation if we use another - # template that first checks power-of-two and then calls the recursive uintImpl - static: - doAssert (bits and (bits-1)) == 0, $bits & " is not a power of 2" - doAssert bits >= 8, "The number of bits in a should be greater or equal to 8" - bits div 2 - -when defined(mpint_test): # TODO stint_test - template uintImpl*(bits: static[int]): untyped = - # Test version, StUint[64] = 2 uint32. Test the logic of the library - - when bits >= 128: UintImpl[uintImpl(checkDiv2(bits))] - elif bits == 64: UintImpl[uint32] - elif bits == 32: UintImpl[uint16] - elif bits == 16: UintImpl[uint8] - else: {.fatal: "Only power-of-2 >=16 supported when testing" .} - - template intImpl*(bits: static[int]): untyped = - # Test version, StInt[64] = 2 uint32. Test the logic of the library - # int is implemented using a signed hi part and an unsigned lo part, given - # that the sign resides in hi - - when bits >= 128: IntImpl[intImpl(checkDiv2(bits)), uintImpl(checkDiv2(bits))] - elif bits == 64: IntImpl[int32, uint32] - elif bits == 32: IntImpl[int16, uint16] - elif bits == 16: IntImpl[int8, uint8] - else: {.fatal: "Only power-of-2 >=16 supported when testing" .} +import + # Status lib + stew/bitops2 +when sizeof(int) == 8 and not defined(Stint32): + type Word* = uint64 else: - template uintImpl*(bits: static[int]): untyped = - mixin UintImpl - when bits >= 128: UintImpl[uintImpl(checkDiv2(bits))] - elif bits == 64: uint64 - elif bits == 32: uint32 - elif bits == 16: uint16 - elif bits == 8: uint8 - else: {.fatal: "Only power-of-2 >=8 supported" .} + type Word* = uint32 - template intImpl*(bits: static[int]): untyped = - # int is implemented using a signed hi part and an unsigned lo part, given - # that the sign resides in hi +type Word* = uint32 - when bits 
>= 128: IntImpl[intImpl(checkDiv2(bits)), uintImpl(checkDiv2(bits))] - elif bits == 64: int64 - elif bits == 32: int32 - elif bits == 16: int16 - elif bits == 8: int8 - else: {.fatal: "Only power-of-2 >=8 supported" .} +func wordsRequired*(bits: int): int {.compileTime.} = + ## Compute the number of limbs required + ## from the **announced** bit length + (bits + WordBitWidth - 1) div WordBitWidth type - # ### Private ### # - UintImpl*[BaseUint] = object - when system.cpuEndian == littleEndian: - lo*, hi*: BaseUint - else: - hi*, lo*: BaseUint - - IntImpl*[BaseInt, BaseUint] = object - # Ints are implemented in terms of uints - when system.cpuEndian == littleEndian: - lo*: BaseUint - hi*: BaseInt - else: - hi*: BaseInt - lo*: BaseUint - - # ### Private ### # + Limbs*[N: static int] = array[N, BaseUint] StUint*[bits: static[int]] = object - data*: uintImpl(bits) + ## Stack-based integer + ## Unsigned + limbs*: Limbs[bits.wordsRequired] StInt*[bits: static[int]] = object - data*: intImpl(bits) + ## Stack-based integer + ## Signed + limbs*: Limbs[bits.wordsRequired] + + Carry* = uint8 # distinct range[0'u8 .. 1] + Borrow* = uint8 # distinct range[0'u8 .. 
1] -template applyHiLo*(a: UintImpl | IntImpl, c: untyped): untyped = - ## Apply `c` to each of `hi` and `lo` - var res: type a - res.hi = c(a.hi) - res.lo = c(a.lo) - res +const GCC_Compatible* = defined(gcc) or defined(clang) or defined(llvm_gcc) +const X86* = defined(amd64) or defined(i386) -template applyHiLo*(a, b: UintImpl | IntImpl, c: untyped): untyped = - ## Apply `c` to each of `hi` and `lo` - var res: type a - res.hi = c(a.hi, b.hi) - res.lo = c(a.lo, b.lo) - res +when sizeof(int) == 8 and GCC_Compatible: + type + uint128*{.importc: "unsigned __int128".} = object template leastSignificantWord*(num: SomeInteger): auto = num -func leastSignificantWord*(num: UintImpl | IntImpl): auto {.inline.} = - when num.lo is UintImpl: - num.lo.leastSignificantWord +func leastSignificantWord*(limbs: Limbs): auto {.inline.} = + when cpuEndian == littleEndian: + limbs[0] else: - num.lo + limbs[^1] -func mostSignificantWord*(num: UintImpl | IntImpl): auto {.inline.} = - when num.hi is (UintImpl | IntImpl): - num.hi.mostSignificantWord +func mostSignificantWord*(limbs: Limbs): auto {.inline.} = + when cpuEndian == littleEndian: + limbs[^1] else: - num.hi + limbs[0] diff --git a/stint/private/primitives/addcarry_subborrow.nim b/stint/private/primitives/addcarry_subborrow.nim new file mode 100644 index 0000000..c4e27df --- /dev/null +++ b/stint/private/primitives/addcarry_subborrow.nim @@ -0,0 +1,169 @@ +# Stint +# Copyright 2018 Status Research & Development GmbH +# Licensed under either of +# +# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +# +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+ +import ../datatypes + +# ############################################################ +# +# Add-with-carry and Sub-with-borrow +# +# ############################################################ +# +# This file implements add-with-carry and sub-with-borrow +# +# It is currently (Mar 2020) impossible to have the compiler +# generate optimal code in a generic way. +# +# On x86, addcarry_u64 intrinsic will generate optimal code +# except for GCC. +# +# On other CPU architectures inline assembly might be desirable. +# A compiler proof-of-concept is available in the "research" folder. +# +# See https://gcc.godbolt.org/z/2h768y +# ```C +# #include +# #include +# +# void add256(uint64_t a[4], uint64_t b[4]){ +# uint8_t carry = 0; +# for (int i = 0; i < 4; ++i) +# carry = _addcarry_u64(carry, a[i], b[i], &a[i]); +# } +# ``` +# +# GCC +# ```asm +# add256: +# movq (%rsi), %rax +# addq (%rdi), %rax +# setc %dl +# movq %rax, (%rdi) +# movq 8(%rdi), %rax +# addb $-1, %dl +# adcq 8(%rsi), %rax +# setc %dl +# movq %rax, 8(%rdi) +# movq 16(%rdi), %rax +# addb $-1, %dl +# adcq 16(%rsi), %rax +# setc %dl +# movq %rax, 16(%rdi) +# movq 24(%rsi), %rax +# addb $-1, %dl +# adcq %rax, 24(%rdi) +# ret +# ``` +# +# Clang +# ```asm +# add256: +# movq (%rsi), %rax +# addq %rax, (%rdi) +# movq 8(%rsi), %rax +# adcq %rax, 8(%rdi) +# movq 16(%rsi), %rax +# adcq %rax, 16(%rdi) +# movq 24(%rsi), %rax +# adcq %rax, 24(%rdi) +# retq +# ``` + +# ############################################################ +# +# Intrinsics +# +# ############################################################ + +# Note: GCC before 2017 had incorrect codegen in some cases: +# - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81300 + +when X86: + when defined(windows): + {.pragma: intrinsics, header:"", nodecl.} + else: + {.pragma: intrinsics, header:"", nodecl.} + + func addcarry_u32(carryIn: Carry, a, b: uint32, sum: var uint32): Carry {.importc: "_addcarry_u32", intrinsics.} + func subborrow_u32(borrowIn: Borrow, a, b: 
uint32, diff: var uint32): Borrow {.importc: "_subborrow_u32", intrinsics.} + + func addcarry_u64(carryIn: Carry, a, b: uint64, sum: var uint64): Carry {.importc: "_addcarry_u64", intrinsics.} + func subborrow_u64(borrowIn: Borrow, a, b:uint64, diff: var uint64): Borrow {.importc: "_subborrow_u64", intrinsics.} + +# ############################################################ +# +# Public +# +# ############################################################ + +func addC*(cOut: var Carry, sum: var uint32, a, b: uint32, cIn: Carry) {.inline.} = + ## Addition with carry + ## (CarryOut, Sum) <- a + b + CarryIn + when X86: + cOut = addcarry_u32(cIn, a, b, sum) + else: + let dblPrec = uint64(cIn) + uint64(a) + uint64(b) + sum = (uint32)(dblPrec) + cOut = Carry(dblPrec shr 32) + +func subB*(bOut: var Borrow, diff: var uint32, a, b: uint32, bIn: Borrow) {.inline.} = + ## Substraction with borrow + ## (BorrowOut, Diff) <- a - b - borrowIn + when X86: + bOut = subborrow_u32(bIn, a, b, diff) + else: + let dblPrec = uint64(a) - uint64(b) - uint64(bIn) + diff = (uint32)(dblPrec) + # On borrow the high word will be 0b1111...1111 and needs to be masked + bOut = Borrow((dblPrec shr 32) and 1) + +func addC*(cOut: var Carry, sum: var uint64, a, b: uint64, cIn: Carry) {.inline.} = + ## Addition with carry + ## (CarryOut, Sum) <- a + b + CarryIn + when X86: + cOut = addcarry_u64(cIn, a, b, sum) + else: + block: + static: + doAssert GCC_Compatible + doAssert sizeof(int) == 8 + + var dblPrec {.noInit.}: uint128 + {.emit:[dblPrec, " = (unsigned __int128)", a," + (unsigned __int128)", b, " + (unsigned __int128)",cIn,";"].} + + # Don't forget to dereference the var param in C mode + when defined(cpp): + {.emit:[cOut, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:[sum, " = (NU64)", dblPrec,";"].} + else: + {.emit:["*",cOut, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:["*",sum, " = (NU64)", dblPrec,";"].} + +func subB*(bOut: var Borrow, diff: var uint64, a, b: uint64, bIn: 
Borrow) {.inline.} = + ## Substraction with borrow + ## (BorrowOut, Diff) <- a - b - borrowIn + when X86: + bOut = subborrow_u64(bIn, a, b, diff) + else: + block: + static: + doAssert GCC_Compatible + doAssert sizeof(int) == 8 + + var dblPrec {.noInit.}: uint128 + {.emit:[dblPrec, " = (unsigned __int128)", a," - (unsigned __int128)", b, " - (unsigned __int128)",bIn,";"].} + + # Don't forget to dereference the var param in C mode + # On borrow the high word will be 0b1111...1111 and needs to be masked + when defined(cpp): + {.emit:[bOut, " = (NU64)(", dblPrec," >> ", 64'u64, ") & 1;"].} + {.emit:[diff, " = (NU64)", dblPrec,";"].} + else: + {.emit:["*",bOut, " = (NU64)(", dblPrec," >> ", 64'u64, ") & 1;"].} + {.emit:["*",diff, " = (NU64)", dblPrec,";"].} diff --git a/stint/private/primitives/extended_precision.nim b/stint/private/primitives/extended_precision.nim new file mode 100644 index 0000000..ef75040 --- /dev/null +++ b/stint/private/primitives/extended_precision.nim @@ -0,0 +1,130 @@ +# Stint +# Copyright 2018 Status Research & Development GmbH +# Licensed under either of +# +# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +# +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+ +# ############################################################ +# +# Extended precision primitives +# +# ############################################################ + +import + ../datatypes, + ./addcarry_subborrow + +# ############################################################ +# +# 32-bit words +# +# ############################################################ + +func div2n1n*(q, r: var uint32, n_hi, n_lo, d: uint32) {.inline.}= + ## Division uint64 by uint32 + ## Warning ⚠️ : + ## - if n_hi == d, quotient does not fit in an uint32 + ## - if n_hi > d result is undefined + ## + ## To avoid issues, n_hi, n_lo, d should be normalized. + ## i.e. shifted (== multiplied by the same power of 2) + ## so that the most significant bit in d is set. + let dividend = (uint64(n_hi) shl 32) or uint64(n_lo) + let divisor = uint64(d) + q = uint32(dividend div divisor) + r = uint32(dividend mod divisor) + +func mul*(hi, lo: var uint32, a, b: uint32) {.inline.} = + ## Extended precision multiplication + ## (hi, lo) <- a*b + let dblPrec = uint64(a) * uint64(b) + lo = uint32(dblPrec) + hi = uint32(dblPrec shr 32) + +func muladd1*(hi, lo: var uint32, a, b, c: uint32) {.inline.} = + ## Extended precision multiplication + addition + ## (hi, lo) <- a*b + c + ## + ## Note: 0xFFFFFFFF² -> (hi: 0xFFFFFFFE, lo: 0x00000001) + ## so adding any c cannot overflow + let dblPrec = uint64(a) * uint64(b) + uint64(c) + lo = uint32(dblPrec) + hi = uint32(dblPrec shr 32) + +func muladd2*(hi, lo: var uint32, a, b, c1, c2: uint32) {.inline.}= + ## Extended precision multiplication + addition + addition + ## This is constant-time on most hardware except some specific one like Cortex M0 + ## (hi, lo) <- a*b + c1 + c2 + ## + ## Note: 0xFFFFFFFF² -> (hi: 0xFFFFFFFE, lo: 0x00000001) + ## so adding 0xFFFFFFFF leads to (hi: 0xFFFFFFFF, lo: 0x00000000) + ## and we have enough space to add again 0xFFFFFFFF without overflowing + let dblPrec = uint64(a) * uint64(b) + uint64(c1) + uint64(c2) + lo = 
uint32(dblPrec) + hi = uint32(dblPrec shr 32) + +# ############################################################ +# +# 64-bit words +# +# ############################################################ + +when sizeof(int) == 8: + when defined(vcc): + from ./extended_precision_x86_64_msvc import div2n1n, mul, muladd1, muladd2 + elif GCCCompatible: + when X86: + from ./extended_precision_x86_64_gcc import div2n1n + from ./extended_precision_64bit_uint128 import mul, muladd1, muladd2 + else: + from ./extended_precision_64bit_uint128 import div2n1n, mul, muladd1, muladd2 + export div2n1n, mul, muladd1, muladd2 + +# ############################################################ +# +# Composite primitives +# +# ############################################################ + +func mulDoubleAdd2*[T: uint32|uint64](r2: var Carry, r1, r0: var T, a, b, c: T, dHi: Carry, dLo: T) {.inline.} = + ## (r2, r1, r0) <- 2*a*b + c + (dHi, dLo) + ## with r = (r2, r1, r0) a triple-word number + ## and d = (dHi, dLo) a double-word number + ## r2 and dHi are carries, either 0 or 1 + + var carry: Carry + + # (r1, r0) <- a*b + # Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFF_FFFFFFFE, lo: 0x00000000_00000001) + mul(r1, r0, a, b) + + # (r2, r1, r0) <- 2*a*b + # Then (hi: 0xFFFFFFFF_FFFFFFFE, lo: 0x00000000_00000001) * 2 + # (carry: 1, hi: 0xFFFFFFFF_FFFFFFFC, lo: 0x00000000_00000002) + addC(carry, r0, r0, r0, Carry(0)) + addC(r2, r1, r1, r1, carry) + + # (r1, r0) <- (r1, r0) + c + # Adding any uint64 cannot overflow into r2 for example Adding 2^64-1 + # (carry: 1, hi: 0xFFFFFFFF_FFFFFFFD, lo: 0x00000000_00000001) + addC(carry, r0, r0, c, Carry(0)) + addC(carry, r1, r1, T(0), carry) + + # (r1, r0) <- (r1, r0) + (dHi, dLo) with dHi a carry (previous limb r2) + # (dHi, dLo) is at most (dhi: 1, dlo: 0xFFFFFFFF_FFFFFFFF) + # summing into (carry: 1, hi: 0xFFFFFFFF_FFFFFFFD, lo: 0x00000000_00000001) + # result at most in (carry: 1, hi: 0xFFFFFFFF_FFFFFFFF, lo: 0x00000000_00000000) + addC(carry, r0, r0, dLo, 
Carry(0)) + addC(carry, r1, r1, T(dHi), carry) + +func mulAcc*[T: uint32|uint64](t, u, v: var T, a, b: T) {.inline.} = + ## (t, u, v) <- (t, u, v) + a * b + var UV: array[2, T] + var carry: Carry + mul(UV[1], UV[0], a, b) + addC(carry, v, v, UV[0], Carry(0)) + addC(carry, u, u, UV[1], carry) + t += T(carry) diff --git a/stint/private/primitives/extended_precision_64bit_uint128.nim b/stint/private/primitives/extended_precision_64bit_uint128.nim new file mode 100644 index 0000000..7861427 --- /dev/null +++ b/stint/private/primitives/extended_precision_64bit_uint128.nim @@ -0,0 +1,95 @@ +# Stint +# Copyright 2018 Status Research & Development GmbH +# Licensed under either of +# +# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +# +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import ../datatypes + +# ############################################################ +# +# Extended precision primitives on GCC & Clang (all CPU archs) +# +# ############################################################ + +static: + doAssert GCC_Compatible + doAssert sizeof(int) == 8 + +func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}= + ## Division uint128 by uint64 + ## Warning ⚠️ : + ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE on some platforms + ## - if n_hi > d result is undefined + var dblPrec {.noInit.}: uint128 + {.emit:[dblPrec, " = (unsigned __int128)", n_hi," << 64 | (unsigned __int128)",n_lo,";"].} + + # Don't forget to dereference the var param in C mode + when defined(cpp): + {.emit:[q, " = (NU64)(", dblPrec," / ", d, ");"].} + {.emit:[r, " = (NU64)(", dblPrec," % ", d, ");"].} + else: + {.emit:["*",q, " = (NU64)(", dblPrec," / ", d, ");"].} + {.emit:["*",r, " = (NU64)(", dblPrec," % ", d, ");"].} + +func mul*(hi, lo: var 
uint64, a, b: uint64) {.inline.} = + ## Extended precision multiplication + ## (hi, lo) <- a*b + block: + var dblPrec {.noInit.}: uint128 + {.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b,";"].} + + # Don't forget to dereference the var param in C mode + when defined(cpp): + {.emit:[hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:[lo, " = (NU64)", dblPrec,";"].} + else: + {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:["*",lo, " = (NU64)", dblPrec,";"].} + +func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} = + ## Extended precision multiplication + addition + ## (hi, lo) <- a*b + c + ## + ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) + ## so adding any c cannot overflow + ## + ## This is constant-time on most hardware + ## See: https://www.bearssl.org/ctmul.html + block: + var dblPrec {.noInit.}: uint128 + {.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, " + (unsigned __int128)",c,";"].} + + # Don't forget to dereference the var param in C mode + when defined(cpp): + {.emit:[hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:[lo, " = (NU64)", dblPrec,";"].} + else: + {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:["*",lo, " = (NU64)", dblPrec,";"].} + +func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}= + ## Extended precision multiplication + addition + addition + ## This is constant-time on most hardware except some specific one like Cortex M0 + ## (hi, lo) <- a*b + c1 + c2 + ## + ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) + ## so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000) + ## and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing + block: + var dblPrec {.noInit.}: uint128 + {.emit:[ + dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, + " + (unsigned __int128)",c1," + 
(unsigned __int128)",c2,";" + ].} + + # Don't forget to dereference the var param in C mode + when defined(cpp): + {.emit:[hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:[lo, " = (NU64)", dblPrec,";"].} + else: + {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:["*",lo, " = (NU64)", dblPrec,";"].} diff --git a/stint/private/primitives/extended_precision_x86_64_gcc.nim b/stint/private/primitives/extended_precision_x86_64_gcc.nim new file mode 100644 index 0000000..0e18c7f --- /dev/null +++ b/stint/private/primitives/extended_precision_x86_64_gcc.nim @@ -0,0 +1,57 @@ +# Stint +# Copyright 2018 Status Research & Development GmbH +# Licensed under either of +# +# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +# +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import ../datatypes + +# ############################################################ +# +# Extended precision primitives for X86-64 on GCC & Clang +# +# ############################################################ + +static: + doAssert(defined(gcc) or defined(clang) or defined(llvm_gcc)) + doAssert sizeof(int) == 8 + doAssert X86 + +func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}= + ## Division uint128 by uint64 + ## Warning ⚠️ : + ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE + ## - if n_hi > d result is undefined + + # DIV r/m64 + # Divide RDX:RAX (n_hi:n_lo) by r/m64 + # + # Inputs + # - numerator high word in RDX, + # - numerator low word in RAX, + # - divisor as r/m parameter (register or memory at the compiler discretion) + # Result + # - Quotient in RAX + # - Remainder in RDX + + # 1. name the register/memory "divisor" + # 2. don't forget to dereference the var hidden pointer + # 3. - + # 4. 
no clobbered registers beside explicitly used RAX and RDX + when defined(cpp): + asm """ + divq %[divisor] + : "=a" (`q`), "=d" (`r`) + : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`) + : + """ + else: + asm """ + divq %[divisor] + : "=a" (`*q`), "=d" (`*r`) + : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`) + : + """ diff --git a/stint/private/primitives/extended_precision_x86_64_msvc.nim b/stint/private/primitives/extended_precision_x86_64_msvc.nim new file mode 100644 index 0000000..9adcd32 --- /dev/null +++ b/stint/private/primitives/extended_precision_x86_64_msvc.nim @@ -0,0 +1,87 @@ +# Stint +# Copyright 2018 Status Research & Development GmbH +# Licensed under either of +# +# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +# +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+ +import + ../datatypes, + ./addcarry_subborrow + +# ############################################################ +# +# Extended precision primitives for X86-64 on MSVC +# +# ############################################################ + +static: + doAssert defined(vcc) + doAssert sizeof(int) == 8 + doAssert X86 + +func udiv128(highDividend, lowDividend, divisor: Ct[uint64], remainder: var Ct[uint64]): Ct[uint64] {.importc:"_udiv128", header: "", nodecl.} + ## Division 128 by 64, Microsoft only, 64-bit only, + ## returns quotient as return value remainder as var parameter + ## Warning ⚠️ : + ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE + ## - if n_hi > d result is undefined + +func umul128(a, b: Ct[uint64], hi: var Ct[uint64]): Ct[uint64] {.importc:"_umul128", header:"", nodecl.} + ## (hi, lo) <-- a * b + ## Return value is the low word + +func div2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}= + ## Division uint128 by uint64 + ## Warning ⚠️ : + ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE + ## - if n_hi > d result is undefined + {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".} + + # TODO !!! - Replace by constant-time, portable, non-assembly version + # -> use uint128? 
Compiler might add unwanted branches + q = udiv128(n_hi, n_lo, d, r) + +func mul*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} = + ## Extended precision multiplication + ## (hi, lo) <- a*b + ## + ## This is constant-time on most hardware + ## See: https://www.bearssl.org/ctmul.html + lo = umul128(a, b, hi) + +func muladd1*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.} = + ## Extended precision multiplication + addition + ## (hi, lo) <- a*b + c + ## + ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) + ## so adding any c cannot overflow + ## + ## This is constant-time on most hardware + ## See: https://www.bearssl.org/ctmul.html + var carry: Carry + lo = umul128(a, b, hi) + addC(carry, lo, lo, c, Carry(0)) + addC(carry, hi, hi, 0, carry) + +func muladd2*(hi, lo: var Ct[uint64], a, b, c1, c2: Ct[uint64]) {.inline.}= + ## Extended precision multiplication + addition + addition + ## This is constant-time on most hardware except some specific one like Cortex M0 + ## (hi, lo) <- a*b + c1 + c2 + ## + ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) + ## so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000) + ## and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing + # For speed this could be implemented with parallel pipelined carry chains + # via MULX + ADCX + ADOX + var carry1, carry2: Carry + + lo = umul128(a, b, hi) + # Carry chain 1 + addC(carry1, lo, lo, c1, Carry(0)) + addC(carry1, hi, hi, 0, carry1) + # Carry chain 2 + addC(carry2, lo, lo, c2, Carry(0)) + addC(carry2, hi, hi, 0, carry2) From 36cc2b2e0282e7869bcc5def5fd5710c7ea94182 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= Date: Fri, 12 Jun 2020 19:01:05 +0200 Subject: [PATCH 02/26] Implement comparison --- stint/private/datatypes.nim | 40 +++++++++++++++++++++++++++++-- stint/private/uint_comparison.nim | 38 ++++++++++++++++++----------- 2 files 
changed, 62 insertions(+), 16 deletions(-) diff --git a/stint/private/datatypes.nim b/stint/private/datatypes.nim index 46da4be..2dd4e67 100644 --- a/stint/private/datatypes.nim +++ b/stint/private/datatypes.nim @@ -16,7 +16,7 @@ when sizeof(int) == 8 and not defined(Stint32): else: type Word* = uint32 -type Word* = uint32 +const WordBitWidth = sizeof(Word) * 8 func wordsRequired*(bits: int): int {.compileTime.} = ## Compute the number of limbs required @@ -24,7 +24,7 @@ func wordsRequired*(bits: int): int {.compileTime.} = (bits + WordBitWidth - 1) div WordBitWidth type - Limbs*[N: static int] = array[N, BaseUint] + Limbs*[N: static int] = array[N, Word] StUint*[bits: static[int]] = object ## Stack-based integer @@ -60,3 +60,39 @@ func mostSignificantWord*(limbs: Limbs): auto {.inline.} = limbs[^1] else: limbs[0] + +iterator leastToMostSig*(limbs: Limbs): Word = + ## Iterate from least to most significant word + when cpuEndian == littleEndian: + for i in 0 ..< limbs.len: + yield limbs[i] + else: + for i in countdown(limbs.len-1, 0): + yield limbs[i] + +iterator leastToMostSig*(limbs: var Limbs): var Word = + ## Iterate from least to most significant word + when cpuEndian == littleEndian: + for i in 0 ..< limbs.len: + yield limbs[i] + else: + for i in countdown(limbs.len-1, 0): + yield limbs[i] + +iterator leastToMostSig*(aLimbs, bLimbs: Limbs): (Word, Word) = + ## Iterate from least to most significant word + when cpuEndian == littleEndian: + for i in 0 ..< limbs.len: + yield (aLimbs[i], bLimbs[i]) + else: + for i in countdown(limbs.len-1, 0): + yield (aLimbs[i], bLimbs[i]) + +iterator leastToMostSig*(aLimbs: var Limbs, bLimbs: Limbs): (var Word, Word) = + ## Iterate from least to most significant word + when cpuEndian == littleEndian: + for i in 0 ..< limbs.len: + yield (aLimbs[i], bLimbs[i]) + else: + for i in countdown(limbs.len-1, 0): + yield (aLimbs[i], bLimbs[i]) diff --git a/stint/private/uint_comparison.nim b/stint/private/uint_comparison.nim index 
2364e5c..eafa9b0 100644 --- a/stint/private/uint_comparison.nim +++ b/stint/private/uint_comparison.nim @@ -7,36 +7,46 @@ # # at your option. This file may not be copied, modified, or distributed except according to those terms. -import ./datatypes +import + ./datatypes, + ./primitives/addcarry_subborrow func isZero*(n: SomeUnsignedInt): bool {.inline.} = n == 0 -func isZero*(n: UintImpl): bool {.inline.} = - n.hi.isZero and n.lo.isZero +func isZero*(limbs: Limbs): bool {.inline.} = + for word in limbs: + if not word.isZero(): + return false + return true -func `<`*(x, y: UintImpl): bool {.inline.}= +func `<`*(x, y: Limbs): bool {.inline.}= # Lower comparison for multi-precision integers - x.hi < y.hi or - (x.hi == y.hi and x.lo < y.lo) + var diff: Word + var borrow: Borrow + for wx, wy in leastToMostSig(x, y): + subB(borrow, diff, wx, wy, borrow) + return bool(borrow) -func `==`*(x, y: UintImpl): bool {.inline.}= +func `==`*(x, y: Limbs): bool {.inline.}= # Equal comparison for multi-precision integers - x.hi == y.hi and x.lo == y.lo + for wx, wy in leastToMostSig(x, y): + if wx != wy: + return false + return true -func `<=`*(x, y: UintImpl): bool {.inline.}= +func `<=`*(x, y: Limbs): bool {.inline.}= # Lower or equal comparison for multi-precision integers - x.hi < y.hi or - (x.hi == y.hi and x.lo <= y.lo) + not(y < x) func isEven*(x: SomeUnsignedInt): bool {.inline.} = (x and 1) == 0 -func isEven*(x: UintImpl): bool {.inline.}= - x.lo.isEven +func isEven*(x: Limbs): bool {.inline.}= + x.leastSignificantWord.isEven func isOdd*(x: SomeUnsignedInt): bool {.inline.} = not x.isEven -func isOdd*(x: UintImpl): bool {.inline.}= +func isOdd*(x: Limbs): bool {.inline.}= not x.isEven From cbbffe4e9c100f3dd35eeced5458104317af9c18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= Date: Fri, 12 Jun 2020 19:23:03 +0200 Subject: [PATCH 03/26] reimplement bitwise --- stint/private/datatypes.nim | 11 +++- stint/private/uint_bitwise_ops.nim | 82 
++++++++++++++---------------- 2 files changed, 49 insertions(+), 44 deletions(-) diff --git a/stint/private/datatypes.nim b/stint/private/datatypes.nim index 2dd4e67..86760e0 100644 --- a/stint/private/datatypes.nim +++ b/stint/private/datatypes.nim @@ -16,7 +16,7 @@ when sizeof(int) == 8 and not defined(Stint32): else: type Word* = uint32 -const WordBitWidth = sizeof(Word) * 8 +const WordBitWidth* = sizeof(Word) * 8 func wordsRequired*(bits: int): int {.compileTime.} = ## Compute the number of limbs required @@ -96,3 +96,12 @@ iterator leastToMostSig*(aLimbs: var Limbs, bLimbs: Limbs): (var Word, Word) = else: for i in countdown(limbs.len-1, 0): yield (aLimbs[i], bLimbs[i]) + +iterator leastToMostSig*(cLimbs: var Limbs, aLimbs: Limbs, bLimbs: Limbs): (var Word, Word, Word) = + ## Iterate from least to most significant word + when cpuEndian == littleEndian: + for i in 0 ..< limbs.len: + yield (cLimbs[i], aLimbs[i], bLimbs[i]) + else: + for i in countdown(limbs.len-1, 0): + yield (cLimbs[i], aLimbs[i], bLimbs[i]) diff --git a/stint/private/uint_bitwise_ops.nim b/stint/private/uint_bitwise_ops.nim index d208e52..49e6088 100644 --- a/stint/private/uint_bitwise_ops.nim +++ b/stint/private/uint_bitwise_ops.nim @@ -7,56 +7,52 @@ # # at your option. This file may not be copied, modified, or distributed except according to those terms. 
-import ./datatypes, ./bitops2_priv +import ./datatypes -func `not`*(x: UintImpl): UintImpl {.inline.}= +func `not`*(x: Limbs): Limbs {.inline.}= ## Bitwise complement of unsigned integer x - applyHiLo(x, `not`) + for wr, wx in leastToMostSig(result, x): + wr = not wx -func `or`*(x, y: UintImpl): UintImpl {.inline.}= +func `or`*(x, y: Limbs): Limbs {.inline.}= ## `Bitwise or` of numbers x and y - applyHiLo(x, y, `or`) + for wr, wx, wy in leastToMostSig(result, x, y): + wr = wx or wy -func `and`*(x, y: UintImpl): UintImpl {.inline.}= +func `and`*(x, y: Limbs): Limbs {.inline.}= ## `Bitwise and` of numbers x and y - applyHiLo(x, y, `and`) + for wr, wx, wy in leastToMostSig(result, x, y): + wr = wx and wy -func `xor`*(x, y: UintImpl): UintImpl {.inline.}= +func `xor`*(x, y: Limbs): Limbs {.inline.}= ## `Bitwise xor` of numbers x and y - applyHiLo(x, y, `xor`) - -func `shr`*(x: UintImpl, y: SomeInteger): UintImpl {.inline.} - # Forward declaration - -func `shl`*(x: UintImpl, y: SomeInteger): UintImpl {.inline.}= - ## Compute the `shift left` operation of x and y - # Note: inlining this poses codegen/aliasing issue when doing `x = x shl 1` - - # TODO: would it be better to reimplement this with words iteration? - const halfSize: type(y) = bitsof(x) div 2 - - if y == 0: - return x - elif y == halfSize: - result.hi = x.lo - elif y < halfSize: - result.hi = (x.hi shl y) or (x.lo shr (halfSize - y)) - result.lo = x.lo shl y + for wr, wx, wy in leastToMostSig(result, x, y): + wr = wx xor wy + +func `shr`*(x: Limbs, k: SomeInteger): Limbs {.inline.} = + ## Shift right by k. 
+ ## + ## k MUST be less than the base word size (2^32 or 2^64) + # Note: for speed, loading a[i] and a[i+1] + # instead of a[i-1] and a[i] + # is probably easier to parallelize for the compiler + # (antidependence WAR vs loop-carried dependence RAW) + when cpuEndian == littleEndian: + for i in 0 ..< x.len-1: + result[i] = (x[i] shr k) or (x[i+1] shl (WordBitWidth - k)) + result[^1] = x[^1] shr k else: - result.hi = x.lo shl (y - halfSize) - -func `shr`*(x: UintImpl, y: SomeInteger): UintImpl {.inline.}= - ## Compute the `shift right` operation of x and y - ## Similar to C standard, result is undefined if y is bigger - ## than the number of bits in x. - const halfSize: type(y) = bitsof(x) div 2 - - if y == 0: - return x - elif y == halfSize: - result.lo = x.hi - elif y < halfSize: - result.lo = (x.lo shr y) or (x.hi shl (halfSize - y)) - result.hi = x.hi shr y + for i in countdown(x.len-1, 1): + result[i] = (x[i] shr k) or (x[i-1] shl (WordBitWidth - k)) + result[0] = x[0] shr k + +func `shl`*(x: Limbs, k: SomeInteger): Limbs {.inline.}= + ## Compute the `shift left` operation of x and k + when cpuEndian == littleEndian: + result[0] = x[0] shl k + for i in 1 ..< x.len: + result[i] = (x[i] shl k) or (x[i-1] shr (WordBitWidth - k)) else: - result.lo = x.hi shr (y - halfSize) + result[^1] = x[^1] shl k + for i in countdown(x.len-2, 0): + result[i] = (x[i] shl k) or (x[i+1] shr (WordBitWidth - k)) From 206ffa92cfe7c3a6a31a420a46752ca55e7f35d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= Date: Fri, 12 Jun 2020 19:59:03 +0200 Subject: [PATCH 04/26] Implement multiprecision addition / substraction --- stint/private/datatypes.nim | 12 +++---- stint/private/uint_addsub.nim | 52 ++++++++++++++++-------------- stint/private/uint_bitwise_ops.nim | 14 ++++---- stint/private/uint_comparison.nim | 20 ++++++------ 4 files changed, 53 insertions(+), 45 deletions(-) diff --git a/stint/private/datatypes.nim b/stint/private/datatypes.nim index 
86760e0..987af07 100644 --- a/stint/private/datatypes.nim +++ b/stint/private/datatypes.nim @@ -82,26 +82,26 @@ iterator leastToMostSig*(limbs: var Limbs): var Word = iterator leastToMostSig*(aLimbs, bLimbs: Limbs): (Word, Word) = ## Iterate from least to most significant word when cpuEndian == littleEndian: - for i in 0 ..< limbs.len: + for i in 0 ..< aLimbs.len: yield (aLimbs[i], bLimbs[i]) else: - for i in countdown(limbs.len-1, 0): + for i in countdown(aLimbs.len-1, 0): yield (aLimbs[i], bLimbs[i]) iterator leastToMostSig*(aLimbs: var Limbs, bLimbs: Limbs): (var Word, Word) = ## Iterate from least to most significant word when cpuEndian == littleEndian: - for i in 0 ..< limbs.len: + for i in 0 ..< aLimbs.len: yield (aLimbs[i], bLimbs[i]) else: - for i in countdown(limbs.len-1, 0): + for i in countdown(aLimbs.len-1, 0): yield (aLimbs[i], bLimbs[i]) iterator leastToMostSig*(cLimbs: var Limbs, aLimbs: Limbs, bLimbs: Limbs): (var Word, Word, Word) = ## Iterate from least to most significant word when cpuEndian == littleEndian: - for i in 0 ..< limbs.len: + for i in 0 ..< aLimbs.len: yield (cLimbs[i], aLimbs[i], bLimbs[i]) else: - for i in countdown(limbs.len-1, 0): + for i in countdown(aLimbs.len-1, 0): yield (cLimbs[i], aLimbs[i], bLimbs[i]) diff --git a/stint/private/uint_addsub.nim b/stint/private/uint_addsub.nim index c4f11bc..135a1b2 100644 --- a/stint/private/uint_addsub.nim +++ b/stint/private/uint_addsub.nim @@ -7,36 +7,40 @@ # # at your option. This file may not be copied, modified, or distributed except according to those terms. 
-import ./conversion, ./initialization, - ./datatypes, - ./uint_comparison, - ./uint_bitwise_ops +import + ./datatypes, ./uint_comparison, ./uint_bitwise_ops, + ./primitives/addcarry_subborrow # ############ Addition & Substraction ############ # +{.push raises: [], inline, noInit, gcsafe.} -func `+`*(x, y: UintImpl): UintImpl {.inline.} - # Forward declaration +func `+`*(x, y: Limbs): Limbs = + # Addition for multi-precision unsigned int + var carry = Carry(0) + for wr, wx, wy in leastToMostSig(result, x, y): + addC(carry, wr, wx, wy, carry) -func `+=`*(x: var UintImpl, y: UintImpl) {.inline.}= +func `+=`*(x: var Limbs, y: Limbs) = ## In-place addition for multi-precision unsigned int - type SubTy = type x.lo - x.lo += y.lo - x.hi += (x.lo < y.lo).toSubtype(SubTy) + y.hi # This helps the compiler produce ADC (add with carry) - -func `+`*(x, y: UintImpl): UintImpl {.inline.}= - # Addition for multi-precision unsigned int - result = x - result += y + var carry = Carry(0) + for wx, wy in leastToMostSig(x, y): + addC(carry, wx, wx, wy, carry) -func `-`*(x, y: UintImpl): UintImpl {.inline.}= +func `-`*(x, y: Limbs): Limbs = # Substraction for multi-precision unsigned int - type SubTy = type x.lo - result.lo = x.lo - y.lo - result.hi = x.hi - y.hi - (x.lo < y.lo).toSubtype(SubTy) # This might (?) 
help the compiler produce SBB (sub with borrow) + var borrow = Borrow(0) + for wr, wx, wy in leastToMostSig(result, x, y): + subB(borrow, wr, wx, wy, borrow) -func `-=`*(x: var UintImpl, y: UintImpl) {.inline.}= +func `-=`*(x: var Limbs, y: Limbs) = ## In-place substraction for multi-precision unsigned int - x = x - y - -func inc*(x: var UintImpl){.inline.}= - x += one(type x) + var borrow = Borrow(0) + for wx, wy in leastToMostSig(x, y): + subB(borrow, wx, wx, wy, borrow) + +func inc*(x: var Limbs, w: SomeUnsignedInt = 1) = + var carry = Carry(0) + when cpuEndian == littleEndian: + addC(carry, x[0], x[0], w, carry) + for i in 1 ..< x.len: + addC(carry, x[i], x[i], 0, carry) diff --git a/stint/private/uint_bitwise_ops.nim b/stint/private/uint_bitwise_ops.nim index 49e6088..0afff9f 100644 --- a/stint/private/uint_bitwise_ops.nim +++ b/stint/private/uint_bitwise_ops.nim @@ -9,27 +9,29 @@ import ./datatypes -func `not`*(x: Limbs): Limbs {.inline.}= +{.push raises: [], inline, noInit, gcsafe.} + +func `not`*(x: Limbs): Limbs = ## Bitwise complement of unsigned integer x for wr, wx in leastToMostSig(result, x): wr = not wx -func `or`*(x, y: Limbs): Limbs {.inline.}= +func `or`*(x, y: Limbs): Limbs = ## `Bitwise or` of numbers x and y for wr, wx, wy in leastToMostSig(result, x, y): wr = wx or wy -func `and`*(x, y: Limbs): Limbs {.inline.}= +func `and`*(x, y: Limbs): Limbs = ## `Bitwise and` of numbers x and y for wr, wx, wy in leastToMostSig(result, x, y): wr = wx and wy -func `xor`*(x, y: Limbs): Limbs {.inline.}= +func `xor`*(x, y: Limbs): Limbs = ## `Bitwise xor` of numbers x and y for wr, wx, wy in leastToMostSig(result, x, y): wr = wx xor wy -func `shr`*(x: Limbs, k: SomeInteger): Limbs {.inline.} = +func `shr`*(x: Limbs, k: SomeInteger): Limbs = ## Shift right by k. 
## ## k MUST be less than the base word size (2^32 or 2^64) @@ -46,7 +48,7 @@ func `shr`*(x: Limbs, k: SomeInteger): Limbs {.inline.} = result[i] = (x[i] shr k) or (x[i-1] shl (WordBitWidth - k)) result[0] = x[0] shr k -func `shl`*(x: Limbs, k: SomeInteger): Limbs {.inline.}= +func `shl`*(x: Limbs, k: SomeInteger): Limbs = ## Compute the `shift left` operation of x and k when cpuEndian == littleEndian: result[0] = x[0] shl k diff --git a/stint/private/uint_comparison.nim b/stint/private/uint_comparison.nim index eafa9b0..48832da 100644 --- a/stint/private/uint_comparison.nim +++ b/stint/private/uint_comparison.nim @@ -11,16 +11,18 @@ import ./datatypes, ./primitives/addcarry_subborrow -func isZero*(n: SomeUnsignedInt): bool {.inline.} = +{.push raises: [], inline, noInit, gcsafe.} + +func isZero*(n: SomeUnsignedInt): bool = n == 0 -func isZero*(limbs: Limbs): bool {.inline.} = +func isZero*(limbs: Limbs): bool = for word in limbs: if not word.isZero(): return false return true -func `<`*(x, y: Limbs): bool {.inline.}= +func `<`*(x, y: Limbs): bool = # Lower comparison for multi-precision integers var diff: Word var borrow: Borrow @@ -28,25 +30,25 @@ func `<`*(x, y: Limbs): bool {.inline.}= subB(borrow, diff, wx, wy, borrow) return bool(borrow) -func `==`*(x, y: Limbs): bool {.inline.}= +func `==`*(x, y: Limbs): bool = # Equal comparison for multi-precision integers for wx, wy in leastToMostSig(x, y): if wx != wy: return false return true -func `<=`*(x, y: Limbs): bool {.inline.}= +func `<=`*(x, y: Limbs): bool = # Lower or equal comparison for multi-precision integers not(y < x) -func isEven*(x: SomeUnsignedInt): bool {.inline.} = +func isEven*(x: SomeUnsignedInt): bool = (x and 1) == 0 -func isEven*(x: Limbs): bool {.inline.}= +func isEven*(x: Limbs): bool = x.leastSignificantWord.isEven -func isOdd*(x: SomeUnsignedInt): bool {.inline.} = +func isOdd*(x: SomeUnsignedInt): bool = not x.isEven -func isOdd*(x: Limbs): bool {.inline.}= +func isOdd*(x: Limbs): bool = 
not x.isEven From a0dec54c12926bac09d615400e511a9884febb13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= Date: Fri, 12 Jun 2020 20:05:40 +0200 Subject: [PATCH 05/26] Implement multiplication --- stint/private/datatypes.nim | 30 +++++ stint/private/uint_addsub.nim | 2 +- stint/private/uint_mul.nim | 226 +++++++++++----------------------- 3 files changed, 103 insertions(+), 155 deletions(-) diff --git a/stint/private/datatypes.nim b/stint/private/datatypes.nim index 987af07..4fa1cfb 100644 --- a/stint/private/datatypes.nim +++ b/stint/private/datatypes.nim @@ -105,3 +105,33 @@ iterator leastToMostSig*(cLimbs: var Limbs, aLimbs: Limbs, bLimbs: Limbs): (var else: for i in countdown(aLimbs.len-1, 0): yield (cLimbs[i], aLimbs[i], bLimbs[i]) + +import std/macros + +proc replaceNodes(ast: NimNode, what: NimNode, by: NimNode): NimNode = + # Replace "what" ident node by "by" + proc inspect(node: NimNode): NimNode = + case node.kind: + of {nnkIdent, nnkSym}: + if node.eqIdent(what): + return by + return node + of nnkEmpty: + return node + of nnkLiterals: + return node + else: + var rTree = node.kind.newTree() + for child in node: + rTree.add inspect(child) + return rTree + result = inspect(ast) + +macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped): untyped = + ## staticFor [min inclusive, max exclusive) + result = newStmtList() + for i in start ..< stopEx: + result.add nnkBlockStmt.newTree( + ident("unrolledIter_" & $idx & $i), + body.replaceNodes(idx, newLit i) + ) diff --git a/stint/private/uint_addsub.nim b/stint/private/uint_addsub.nim index 135a1b2..c99795e 100644 --- a/stint/private/uint_addsub.nim +++ b/stint/private/uint_addsub.nim @@ -8,7 +8,7 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. 
import - ./datatypes, ./uint_comparison, ./uint_bitwise_ops, + ./datatypes, ./primitives/addcarry_subborrow # ############ Addition & Substraction ############ # diff --git a/stint/private/uint_mul.nim b/stint/private/uint_mul.nim index 055289c..022cef5 100644 --- a/stint/private/uint_mul.nim +++ b/stint/private/uint_mul.nim @@ -7,160 +7,78 @@ # # at your option. This file may not be copied, modified, or distributed except according to those terms. -import macros, - ./conversion, - ./initialization, - ./datatypes, - ./uint_comparison, - ./uint_addsub +import + ./datatypes, + ./primitives/extended_precision # ################### Multiplication ################### # - -func lo(x: uint64): uint64 {.inline.} = - const - p: uint64 = 32 - base: uint64 = 1'u64 shl p - mask: uint64 = base - 1 - result = x and mask - -func hi(x: uint64): uint64 {.inline.} = - const - p = 32 - result = x shr p - -# No generic, somehow Nim is given ambiguous call with the T: UintImpl overload -func extPrecMul*(result: var UintImpl[uint8], x, y: uint8) {.inline.}= - ## Extended precision multiplication - result = cast[type result](x.asDoubleUint * y.asDoubleUint) - -func extPrecMul*(result: var UintImpl[uint16], x, y: uint16) {.inline.}= - ## Extended precision multiplication - result = cast[type result](x.asDoubleUint * y.asDoubleUint) - -func extPrecMul*(result: var UintImpl[uint32], x, y: uint32) {.inline.}= - ## Extended precision multiplication - result = cast[type result](x.asDoubleUint * y.asDoubleUint) - -func extPrecAddMul[T: uint8 or uint16 or uint32](result: var UintImpl[T], x, y: T) {.inline.}= - ## Extended precision fused in-place addition & multiplication - result += cast[type result](x.asDoubleUint * y.asDoubleUint) - -template extPrecMulImpl(result: var UintImpl[uint64], op: untyped, u, v: uint64) = - const - p = 64 div 2 - base: uint64 = 1'u64 shl p - - var - x0, x1, x2, x3: uint64 - - let - ul = lo(u) - uh = hi(u) - vl = lo(v) - vh = hi(v) - - x0 = ul * vl - x1 = ul * vh - 
x2 = uh * vl - x3 = uh * vh - - x1 += hi(x0) # This can't carry - x1 += x2 # but this can - if x1 < x2: # if carry, add it to x3 - x3 += base - - op(result.hi, x3 + hi(x1)) - op(result.lo, (x1 shl p) or lo(x0)) - -func extPrecMul*(result: var UintImpl[uint64], u, v: uint64) = - ## Extended precision multiplication - extPrecMulImpl(result, `=`, u, v) - -func extPrecAddMul(result: var UintImpl[uint64], u, v: uint64) = - ## Extended precision fused in-place addition & multiplication - extPrecMulImpl(result, `+=`, u, v) - -macro eqSym(x, y: untyped): untyped = - let eq = $x == $y # Unfortunately eqIdent compares to string. - result = newLit eq - -func extPrecAddMul[T](result: var UintImpl[UintImpl[T]], u, v: UintImpl[T]) -func extPrecMul*[T](result: var UintImpl[UintImpl[T]], u, v: UintImpl[T]) - # Forward declaration - -template extPrecMulImpl*[T](result: var UintImpl[UintImpl[T]], op: untyped, x, y: UintImpl[T]) = - # See details at - # https://en.wikipedia.org/wiki/Karatsuba_algorithm - # https://locklessinc.com/articles/256bit_arithmetic/ - # https://www.miracl.com/press/missing-a-trick-karatsuba-variations-michael-scott - # - # We use the naive school grade multiplication instead of Karatsuba I.e. - # z1 = x.hi * y.lo + x.lo * y.hi (Naive) = (x.lo - x.hi)(y.hi - y.lo) + z0 + z2 (Karatsuba) +{.push raises: [], gcsafe.} + +func prod*[rLen, aLen, bLen](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) = + ## Multi-precision multiplication + ## r <- a*b + ## + ## `a`, `b`, `r` can have a different number of limbs + ## if `r`.limbs.len < a.limbs.len + b.limbs.len + ## The result will be truncated, i.e. 
it will be + ## a * b (mod (2^WordBitwidth)^r.limbs.len) + + # We use Product Scanning / Comba multiplication + var t, u, v = Word(0) + var z: Limbs[rLen] # zero-init, ensure on stack and removes in-place problems + + staticFor i, 0, min(a.len+b.len, r.len): + const ib = min(b.len-1, i) + const ia = i - ib + staticFor j, 0, min(a.len - ia, ib+1): + mulAcc(t, u, v, a[ia+j], b[ib-j]) + + z[i] = v + v = u + u = t + t = Word(0) + + r = z + +func prod_high_words*[rLen, aLen, bLen]( + r: var Limbs[rLen], + a: Limbs[aLen], b: Limbs[bLen], + lowestWordIndex: static int) = + ## Multi-precision multiplication keeping only high words + ## r <- a*b >> (2^WordBitWidth)^lowestWordIndex + ## + ## `a`, `b`, `r` can have a different number of limbs + ## if `r`.limbs.len < a.limbs.len + b.limbs.len - lowestWordIndex + ## The result will be truncated, i.e. it will be + ## a * b >> (2^WordBitWidth)^lowestWordIndex (mod (2^WordBitwidth)^r.limbs.len) # - # On modern architecture: - # - addition and multiplication have the same cost - # - Karatsuba would require to deal with potentially negative intermediate result - # and introduce branching - # - More total operations means more register moves - - var z1: type x - - # Low part and hi part - z0 & z2 - when eqSym(op, `+=`): - extPrecAddMul(result.lo, x.lo, y.lo) - extPrecAddMul(result.hi, x.hi, y.hi) - else: - extPrecMul(result.lo, x.lo, y.lo) - extPrecMul(result.hi, x.hi, y.hi) - - ## TODO - fuse those parts and reduce the number of carry checks - # Middle part - z1 - 1st mul - extPrecMul(z1, x.hi, y.lo) - result.lo.hi += z1.lo - if result.lo.hi < z1.lo: - inc result.hi - - result.hi.lo += z1.hi - if result.hi.lo < z1.hi: - inc result.hi.hi - - # Middle part - z1 - 2nd mul - extPrecMul(z1, x.lo, y.hi) - result.lo.hi += z1.lo - if result.lo.hi < z1.lo: - inc result.hi - - result.hi.lo += z1.hi - if result.hi.lo < z1.hi: - inc result.hi.hi - -func extPrecAddMul[T](result: var UintImpl[UintImpl[T]], u, v: UintImpl[T]) = - ## Extended 
precision fused in-place addition & multiplication - extPrecMulImpl(result, `+=`, u, v) - -func extPrecMul*[T](result: var UintImpl[UintImpl[T]], u, v: UintImpl[T]) = - ## Extended precision multiplication - extPrecMulImpl(result, `=`, u, v) - -func `*`*[T](x, y: UintImpl[T]): UintImpl[T] {.inline.}= - ## Multiplication for multi-precision unsigned uint - # - # For our representation, it is similar to school grade multiplication - # Consider hi and lo as if they were digits - # - # 12 - # X 15 - # ------ - # 10 lo*lo -> z0 - # 5 hi*lo -> z1 - # 2 lo*hi -> z1 - # 10 hi*hi -- z2 - # ------ - # 180 - # - # If T is a type - # For T * T --> T we don't need to compute z2 as it always overflow - # For T * T --> 2T (uint64 * uint64 --> uint128) we use extra precision multiplication - - extPrecMul(result, x.lo, y.lo) - result.hi += x.lo * y.hi + x.hi * y.lo + # This is useful for + # - Barret reduction + # - Approximating multiplication by a fractional constant in the form f(a) = K/C * a + # with K and C known at compile-time. + # We can instead find a well chosen M = (2^WordBitWidth)^w, with M > C (i.e. M is a power of 2 bigger than C) + # Precompute P = K*M/C at compile-time + # and at runtime do P*a/M <=> P*a >> (WordBitWidth*w) + # i.e. 
prod_high_words(result, P, a, w) + + # We use Product Scanning / Comba multiplication + var t, u, v = Word(0) # Will raise warning on empty iterations + var z: Limbs[rLen] # zero-init, ensure on stack and removes in-place problems + + # The previous 2 columns can affect the lowest word due to carries + # but not the ones before (we accumulate in 3 words (t, u, v)) + const w = lowestWordIndex - 2 + + staticFor i, max(0, w), min(a.len+b.len, r.len+lowestWordIndex): + const ib = min(b.len-1, i) + const ia = i - ib + staticFor j, 0, min(a.len - ia, ib+1): + mulAcc(t, u, v, a[ia+j], b[ib-j]) + + when i >= lowestWordIndex: + z[i-lowestWordIndex] = v + v = u + u = t + t = Word(0) + + r = z From 7f6c588ce35ff494593988d2e6151494fa2d2204 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= Date: Fri, 12 Jun 2020 23:53:08 +0200 Subject: [PATCH 06/26] Passing addition tests (however simple bitwise ops crash the int128 VM ... during compilation) --- stint.nim | 19 +- stint/bitops2.nim | 16 - stint/endians2.nim | 174 +++-- stint/io.nim | 598 ++++++++---------- stint/private/bitops2_priv.nim | 58 -- stint/private/compiletime_helpers.nim | 82 --- stint/private/datatypes.nim | 79 ++- stint/private/endians2_priv.nim | 26 - stint/private/initialization.nim | 19 - .../private/primitives/addcarry_subborrow.nim | 101 +-- .../primitives/compiletime_fallback.nim | 108 ++++ stint/private/uint_addsub.nim | 46 -- stint/private/uint_bitwise_ops.nim | 60 -- stint/private/uint_comparison.nim | 54 -- stint/private/uint_highlow.nim | 16 - stint/private/uint_mul.nim | 10 +- stint/uintops.nim | 230 +++++++ 17 files changed, 855 insertions(+), 841 deletions(-) delete mode 100644 stint/bitops2.nim delete mode 100644 stint/private/bitops2_priv.nim delete mode 100644 stint/private/compiletime_helpers.nim delete mode 100644 stint/private/endians2_priv.nim delete mode 100644 stint/private/initialization.nim create mode 100644 stint/private/primitives/compiletime_fallback.nim delete 
mode 100644 stint/private/uint_addsub.nim delete mode 100644 stint/private/uint_bitwise_ops.nim delete mode 100644 stint/private/uint_comparison.nim delete mode 100644 stint/private/uint_highlow.nim create mode 100644 stint/uintops.nim diff --git a/stint.nim b/stint.nim index 2b7c29a..2631d54 100644 --- a/stint.nim +++ b/stint.nim @@ -7,12 +7,15 @@ # # at your option. This file may not be copied, modified, or distributed except according to those terms. -import stint/[bitops2, endians2, intops, io, modular_arithmetic, literals_stint] -export bitops2, endians2, intops, io, modular_arithmetic, literals_stint +# import stint/[bitops2, endians2, intops, io, modular_arithmetic, literals_stint] +# export bitops2, endians2, intops, io, modular_arithmetic, literals_stint + +import stint/[io, uintops, bitops2] +export io, uintops, bitops2 type - Int128* = StInt[128] - Int256* = StInt[256] + # Int128* = Stint[128] + # Int256* = Stint[256] UInt128* = StUint[128] UInt256* = StUint[256] @@ -22,8 +25,8 @@ func u128*(s: string): UInt128 {.inline.} = s.parse(UInt128) func u256*(n: SomeInteger): UInt256 {.inline.} = n.stuint(256) func u256*(s: string): UInt256 {.inline.} = s.parse(UInt256) -func i128*(n: SomeInteger): Int128 {.inline.} = n.stint(128) -func i128*(s: string): Int128 {.inline.} = s.parse(Int128) +# func i128*(n: SomeInteger): Int128 {.inline.} = n.stint(128) +# func i128*(s: string): Int128 {.inline.} = s.parse(Int128) -func i256*(n: SomeInteger): Int256 {.inline.} = n.stint(256) -func i256*(s: string): Int256 {.inline.} = s.parse(Int256) +# func i256*(n: SomeInteger): Int256 {.inline.} = n.stint(256) +# func i256*(s: string): Int256 {.inline.} = s.parse(Int256) diff --git a/stint/bitops2.nim b/stint/bitops2.nim deleted file mode 100644 index 9bad7c8..0000000 --- a/stint/bitops2.nim +++ /dev/null @@ -1,16 +0,0 @@ -# Stint -# Copyright 2018 Status Research & Development GmbH -# Licensed under either of -# -# * Apache License, version 2.0, 
([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) -# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) -# -# at your option. This file may not be copied, modified, or distributed except according to those terms. - -import ./private/[bitops2_priv, datatypes] - -func countOnes*(x: StUint): int {.inline.} = countOnes(x.data) -func parity*(x: StUint): int {.inline.} = parity(x.data) -func firstOne*(x: StUint): int {.inline.} = firstOne(x.data) -func leadingZeros*(x: StUint): int {.inline.} = leadingZeros(x.data) -func trailingZeros*(x: StUint): int {.inline.} = trailingZeros(x.data) diff --git a/stint/endians2.nim b/stint/endians2.nim index be6beea..6232a94 100644 --- a/stint/endians2.nim +++ b/stint/endians2.nim @@ -7,16 +7,21 @@ # # at your option. This file may not be copied, modified, or distributed except according to those terms. -import private/[bitops2_priv, endians2_priv, datatypes, compiletime_helpers] +import private/datatypes import stew/endians2 export endians2 -func swapBytes*(x: StUint): StUint {.inline.} = StUint(data: swapBytes(x.data)) +{.push raises: [IndexError], noInit, gcsafe.} func toBytes*[bits: static int](x: StUint[bits], endian: Endianness = system.cpuEndian): array[bits div 8, byte] {.inline.} = - toBytes(x.data, endian) + when endian == system.cpuEndian: + for i in 0 ..< x.limbs.len: + result[i * sizeof(Word)] = x.limbs[i].toBytes() + else: + for i in 0 ..< x.limbs.len: + result[i * sizeof(Word)] = x.limbs[^i].toBytes() func toBytesLE*[bits: static int](x: StUint[bits]): array[bits div 8, byte] {.inline.} = @@ -26,83 +31,138 @@ func toBytesBE*[bits: static int](x: StUint[bits]): array[bits div 8, byte] {.inline.} = toBytes(x, bigEndian) -func fromBytes*[bits: static int]( +func fromBytesBE*[bits: static int]( T: typedesc[StUint[bits]], - x: array[bits div 8, byte], - endian: Endianness = system.cpuEndian): T {.inline, noinit.} = + x: openArray[byte]): T = + ## Read big endian 
bytes and convert to an integer. At runtime, v must contain + ## at least sizeof(T) bytes. Native endianess is used which is not + ## portable! (i.e. use fixed-endian byte array or hex for serialization) - when nimvm: - copyFromArray(result.data, x) - else: - copyMem(addr result, unsafeAddr x[0], bits div 8) + var accum: Word + var accumBits: int + var dstIdx: int - if endian != system.cpuEndian: - result = swapBytes(result) + when cpuEndian == littleEndian: # src is bigEndian, CPU is little-endian + dstIdx = 0 -func fromBytes*[bits: static int]( - T: typedesc[StUint[bits]], - x: openArray[byte], - endian: Endianness = system.cpuEndian): T {.inline.} = - # TODO fromBytesBE in io.nim handles this better, merge the two! - var tmp: array[bits div 8, byte] - if x.len < tmp.len: - let offset = if endian == bigEndian: tmp.len - x.len else: 0 - for i in 0..= WordBitWidth: + result.limbs[dstIdx] = accum + inc dstIdx + accumBits -= WordBitWidth + accum = srcByte shr (8 - accumBits) + + if dstIdx < result.limbs.len: + result.limbs[dstIdx] = accum + for fillIdx in dstIdx+1 ..< result.limbs.len: + result.limbs[fillIdx] = 0 + else: # src and CPU are bigEndian + dstIdx = result.limbs.len-1 + + for srcIdx in countdown(x.len-1, 0): + let srcByte = x[srcIdx] + + accum = accum or (srcByte shl accumBits) + accumBits += 8 + + if accumBits >= WordBitWidth: + result.limbs[dstIdx] = accum + dec dstIdx + accumBits -= WordBitWidth + accum = srcByte shr (8 - accumBits) + + if dstIdx > 0: + result.limbs[dstIdx] = accum + for fillIdx in 0 ..< dstIdx: + result.limbs[fillIdx] = 0 + +func fromBytesLE*[bits: static int]( T: typedesc[StUint[bits]], - x: array[bits div 8, byte]): T {.inline.} = - ## Read big endian bytes and convert to an integer. By default, native - ## endianess is used which is not - ## portable! - fromBytes(T, x, bigEndian) + x: openArray[byte]): T = + ## Read little endian bytes and convert to an integer. At runtime, v must + ## contain at least sizeof(T) bytes. 
By default, native endianess is used + ## which is not portable! (i.e. use fixed-endian byte array or hex for serialization) -func fromBytesBE*[bits: static int]( + var accum: Word + var accumBits: int + var dstIdx: int + + when cpuEndian == littleEndian: # src and CPU are little-endian + dstIdx = 0 + + for srcIdx in 0 ..< x.len: + let srcByte = x[srcIdx] + + accum = accum or (srcByte shl accumBits) + accumBits += 8 + + if accumBits >= WordBitWidth: + result.limbs[dstIdx] = accum + inc dstIdx + accumBits -= WordBitWidth + accum = srcByte shr (8 - accumBits) + + if dstIdx < result.limbs.len: + result.limbs[dstIdx] = accum + for fillIdx in dstIdx+1 ..< result.limbs.len: + result.limbs[fillIdx] = 0 + else: # src is little endian, CPU is bigEndian + dstIdx = result.limbs.len-1 + + for srcIdx in 0 ..< x.len: + let srcByte = x[srcIdx] + + accum = accum or (srcByte shl accumBits) + accumBits += 8 + + if accumBits >= WordBitWidth: + result.limbs[dstIdx] = accum + dec dstIdx + accumBits -= WordBitWidth + accum = srcByte shr (8 - accumBits) + + if dstIdx > 0: + result.limbs[dstIdx] = accum + for fillIdx in 0 ..< dstIdx: + result.limbs[fillIdx] = 0 + +func fromBytes*[bits: static int]( T: typedesc[StUint[bits]], - x: openArray[byte]): T {.inline.} = - ## Read big endian bytes and convert to an integer. At runtime, v must contain - ## at least sizeof(T) bytes. By default, native endianess is used which is not - ## portable! - fromBytes(T, x, bigEndian) + x: openarray[byte], + srcEndian: Endianness = system.cpuEndian): T {.inline.} = + ## Read an source bytearray with the specified endianness and + ## convert it to an integer + when srcEndian == littleEndian: + result = fromBytesLE(T, x) + else: + result = fromBytesBE(T, x) + +# TODO: What is the use-case for all the procs below? 
+# ------------------------------------------------------------------------------------------ -func toBE*[bits: static int](x: StUint[bits]): StUint[bits] {.inline.} = +func toBE*[bits: static int](x: StUint[bits]): StUint[bits] {.inline, deprecated: "Use toByteArrayBE instead".} = ## Convert a native endian value to big endian. Consider toBytesBE instead ## which may prevent some confusion. if cpuEndian == bigEndian: x else: x.swapBytes -func fromBE*[bits: static int](x: StUint[bits]): StUint[bits] {.inline.} = +func fromBE*[bits: static int](x: StUint[bits]): StUint[bits] {.inline, deprecated: "Use fromBytesBE instead".} = ## Read a big endian value and return the corresponding native endian # there's no difference between this and toBE, except when reading the code toBE(x) -func fromBytesLE*[bits: static int]( - T: typedesc[StUint[bits]], - x: array[bits div 8, byte]): StUint[bits] {.inline.} = - ## Read little endian bytes and convert to an integer. By default, native - ## endianess is used which is not portable! - fromBytes(T, x, littleEndian) - -func fromBytesLE*[bits: static int]( - T: typedesc[StUint[bits]], - x: openArray[byte]): StUint[bits] {.inline.} = - ## Read little endian bytes and convert to an integer. At runtime, v must - ## contain at least sizeof(T) bytes. By default, native endianess is used - ## which is not portable! - fromBytes(T, x, littleEndian) - -func toLE*[bits: static int](x: StUint[bits]): StUint[bits] {.inline.} = +func toLE*[bits: static int](x: StUint[bits]): StUint[bits] {.inline, deprecated.} = ## Convert a native endian value to little endian. Consider toBytesLE instead ## which may prevent some confusion. 
if cpuEndian == littleEndian: x else: x.swapBytes -func fromLE*[bits: static int](x: StUint[bits]): StUint[bits] {.inline.} = +func fromLE*[bits: static int](x: StUint[bits]): StUint[bits] {.inline, deprecated: "Use fromBytesLE instead".} = ## Read a little endian value and return the corresponding native endian # there's no difference between this and toLE, except when reading the code toLE(x) diff --git a/stint/io.nim b/stint/io.nim index a512c05..26bca80 100644 --- a/stint/io.nim +++ b/stint/io.nim @@ -9,9 +9,10 @@ import ./private/datatypes, - ./private/int_negabs, - ./private/compiletime_helpers, - ./intops, + # ./private/int_negabs, + # ./private/compiletime_helpers, + # ./intops, + ./uintops, ./endians2, typetraits, algorithm, hashes template static_check_size(T: typedesc[SomeInteger], bits: static[int]) = @@ -24,178 +25,144 @@ template static_check_size(T: typedesc[SomeInteger], bits: static[int]) = "\nUse a smaller input type instead. This is a compile-time check" & " to avoid a costly run-time bit_length check at each StUint initialization." -func assignLo(result: var (UintImpl | IntImpl), n: SomeInteger) {.inline.} = - when result.lo is UintImpl: - assignLo(result.lo, n) - else: - result.lo = (type result.lo)(n) - func stuint*[T: SomeInteger](n: T, bits: static[int]): StUint[bits] {.inline.}= ## Converts an integer to an arbitrary precision integer. - - doAssert n >= 0.T - when result.data is UintImpl: - static_check_size(T, bits) - assignLo(result.data, n) - else: - result.data = (type result.data)(n) - -func stint*[T: SomeInteger](n: T, bits: static[int]): StInt[bits] {.inline.}= - ## Converts an integer to an arbitrary precision signed integer. 
- - when result.data is IntImpl: - static_check_size(T, bits) - when T is SomeSignedInt: - if n < 0: - # TODO: when bits >= 128, cannot create from - # low(int8-64) - # see: status-im/nim-stint/issues/92 - assignLo(result.data, -n) - result = -result - else: - assignLo(result.data, n) - else: - assignLo(result.data, n) + when cpuEndian == littleEndian: + result.limbs[0] = Word(n) + when sizeof(n) > sizeof(Word): + result.limbs[1] = Word(n) shr WordBitWidth else: - result.data = (type result.data)(n) + result.limbs[^1] = Word(n) + when sizeof(n) > sizeof(Word): + result.limbs[^2] = Word(n) shr WordBitWidth +<<<<<<< HEAD func to*(x: SomeInteger, T: typedesc[StInt]): T = stint(x, result.bits) func to*(x: SomeUnsignedInt, T: typedesc[StUint]): T = stuint(x, result.bits) +======= +# func stint*[T: SomeInteger](n: T, bits: static[int]): StInt[bits] {.inline.}= +# ## Converts an integer to an arbitrary precision signed integer. +# +# when result.data is IntImpl: +# static_check_size(T, bits) +# when T is SomeSignedInt: +# if n < 0: +# # TODO: when bits >= 128, cannot create from +# # low(int8-64) +# # see: status-im/nim-stint/issues/92 +# assignLo(result.data, -n) +# result = -result +# else: +# assignLo(result.data, n) +# else: +# assignLo(result.data, n) +# else: +# result.data = (type result.data)(n) + +# func to*(a: SomeInteger, T: typedesc[Stint]): T = +# stint(a, result.bits) + +func to*(a: SomeUnsignedInt, T: typedesc[StUint]): T = + stuint(a, result.bits) func truncate*(num: StInt or StUint, T: typedesc[SomeInteger]): T {.inline.}= ## Extract the int, uint, int8-int64 or uint8-uint64 portion of a multi-precision integer. ## Note that int and uint are 32-bit on 32-bit platform. ## For unsigned result type, result is modulo 2^(sizeof T in bit) ## For signed result type, result is undefined if input does not fit in the target type. 
- static: - doAssert bitsof(T) <= bitsof(num.data.leastSignificantWord) - - when nimvm: - let data = num.data.leastSignificantWord - vmIntCast[T](data) - else: - cast[T](num.data.leastSignificantWord) + result = T(num.leastSignificantWord()) func toInt*(num: StInt or StUint): int {.inline, deprecated:"Use num.truncate(int) instead".}= num.truncate(int) -func bigToSmall(result: var (UintImpl | IntImpl), x: auto) {.inline.} = - when bitsof(x) == bitsof(result): - when type(result) is type(x): - result = x - else: - result = convert[type(result)](x) - else: - bigToSmall(result, x.lo) - -func smallToBig(result: var (UintImpl | IntImpl), x: auto) {.inline.} = - when bitsof(x) == bitsof(result): - when type(result) is type(x): - result = x - else: - result = convert[type(result)](x) - else: - smallToBig(result.lo, x) - -func stuint*(x: StUint, bits: static[int]): StUint[bits] {.inline.} = +func stuint*(a: StUint, bits: static[int]): StUint[bits] {.inline.} = ## unsigned int to unsigned int conversion ## smaller to bigger bits conversion will have the same value ## bigger to smaller bits conversion, the result is truncated - const N = bitsof(x.data) - when N < bits: - when N <= 64: - result = stuint(x.data, bits) - else: - smallToBig(result.data, x.data) - elif N > bits: - when bits <= 64: - result = stuint(x.truncate(type(result.data)), bits) - else: - bigToSmall(result.data, x.data) - else: - result = x - -func stuint*(x: StInt, bits: static[int]): StUint[bits] {.inline.} = - ## signed int to unsigned int conversion - ## current behavior is cast-like, copying bit pattern - ## or truncating if input does not fit into destination - const N = bitsof(x.data) - when N < bits: - when N <= 64: - type T = StUint[N] - result = stuint(convert[T](x).data, bits) - else: - smallToBig(result.data, x.data) - elif N > bits: - when bits <= 64: - result = stuint(x.truncate(type(result.data)), bits) - else: - bigToSmall(result.data, x.data) - else: - result = convert[type(result)](x) - 
-func stint*(x: StInt, bits: static[int]): StInt[bits] {.inline.} = - ## signed int to signed int conversion - ## will raise exception if input does not fit into destination - const N = bitsof(x.data) - when N < bits: - when N <= 64: - result = stint(x.data, bits) - else: - if x.isNegative: - smallToBig(result.data, (-x).data) - result = -result - else: - smallToBig(result.data, x.data) - elif N > bits: - template checkNegativeRange() = - # due to bug #92, we skip negative range check - when false: - const dmin = stint((type result).low, N) - if x < dmin: raise newException(ValueError, "value out of range") - - template checkPositiveRange() = - const dmax = stint((type result).high, N) - if x > dmax: raise newException(ValueError, "value out of range") - - when bits <= 64: - if x.isNegative: - checkNegativeRange() - result = stint((-x).truncate(type(result.data)), bits) - result = -result - else: - checkPositiveRange() - result = stint(x.truncate(type(result.data)), bits) - else: - if x.isNegative: - checkNegativeRange() - bigToSmall(result.data, (-x).data) - result = -result - else: - checkPositiveRange() - bigToSmall(result.data, x.data) - else: - result = x - -func stint*(x: StUint, bits: static[int]): StInt[bits] {.inline.} = - const N = bitsof(x.data) - const dmax = stuint((type result).high, N) - if x > dmax: raise newException(ValueError, "value out of range") - when N < bits: - when N <= 64: - result = stint(x.data, bits) - else: - smallToBig(result.data, x.data) - elif N > bits: - when bits <= 64: - result = stint(x.truncate(type(result.data)), bits) - else: - bigToSmall(result.data, x.data) - else: - result = convert[type(result)](x) + for wr, wa in leastToMostSig(result, a): + wr = wa + +# func stuint*(a: StInt, bits: static[int]): StUint[bits] {.inline.} = +# ## signed int to unsigned int conversion +# ## current behavior is cast-like, copying bit pattern +# ## or truncating if input does not fit into destination +# const N = bitsof(x.data) +# when N < 
bits: +# when N <= 64: +# type T = StUint[N] +# result = stuint(convert[T](a).data, bits) +# else: +# smallToBig(result.data, a.data) +# elif N > bits: +# when bits <= 64: +# result = stuint(x.truncate(type(result.data)), bits) +# else: +# bigToSmall(result.data, a.data) +# else: +# result = convert[type(result)](a) + +# func stint*(a: StInt, bits: static[int]): StInt[bits] {.inline.} = +# ## signed int to signed int conversion +# ## will raise exception if input does not fit into destination +# const N = bitsof(a.data) +# when N < bits: +# when N <= 64: +# result = stint(a.data, bits) +# else: +# if a.isNegative: +# smallToBig(result.data, (-a).data) +# result = -result +# else: +# smallToBig(result.data, a.data) +# elif N > bits: +# template checkNegativeRange() = +# # due to bug #92, we skip negative range check +# when false: +# const dmin = stint((type result).low, N) +# if a < dmin: raise newException(RangeError, "value out of range") + +# template checkPositiveRange() = +# const dmax = stint((type result).high, N) +# if a > dmax: raise newException(RangeError, "value out of range") + +# when bits <= 64: +# if a.isNegative: +# checkNegativeRange() +# result = stint((-a).truncate(type(result.data)), bits) +# result = -result +# else: +# checkPositiveRange() +# result = stint(a.truncate(type(result.data)), bits) +# else: +# if a.isNegative: +# checkNegativeRange() +# bigToSmall(result.data, (-a).data) +# result = -result +# else: +# checkPositiveRange() +# bigToSmall(result.data, a.data) +# else: +# result = a + +# func stint*(a: StUint, bits: static[int]): StInt[bits] {.inline.} = +# const N = bitsof(a.data) +# const dmax = stuint((type result).high, N) +# if a > dmax: raise newException(RangeError, "value out of range") +# when N < bits: +# when N <= 64: +# result = stint(a.data, bits) +# else: +# smallToBig(result.data, a.data) +# elif N > bits: +# when bits <= 64: +# result = stint(a.truncate(type(result.data)), bits) +# else: +# bigToSmall(result.data, 
a.data) +# else: +# result = convert[type(result)](a) func readHexChar(c: char): int8 {.inline.}= ## Converts an hex char to an int @@ -270,43 +237,43 @@ func parse*[bits: static[int]](input: string, T: typedesc[StUint[bits]], radix: result = result * base + input[curr].readHexChar.stuint(bits) nextNonBlank(curr, input) -func parse*[bits: static[int]](input: string, T: typedesc[StInt[bits]], radix: static[int8] = 10): T = - ## Parse a string and store the result in a StInt[bits] or StUint[bits]. +# func parse*[bits: static[int]](input: string, T: typedesc[Stint[bits]], radix: static[int8] = 10): T = +# ## Parse a string and store the result in a Stint[bits] or Stuint[bits]. - static: doAssert (radix >= 2) and radix <= 16, "Only base from 2..16 are supported" - # TODO: use static[range[2 .. 16]], not supported at the moment (2018-04-26) +# static: doAssert (radix >= 2) and radix <= 16, "Only base from 2..16 are supported" +# # TODO: use static[range[2 .. 16]], not supported at the moment (2018-04-26) - # TODO: we can special case hex result/input as an array of bytes - # and be much faster +# # TODO: we can special case hex result/input as an array of bytes +# # and be much faster - # For conversion we require overflowing operations (for example for negative hex numbers) - const base = radix.int8.stuint(bits) +# # For conversion we require overflowing operations (for example for negative hex numbers) +# const base = radix.int8.stuint(bits) - var - curr = 0 # Current index in the string - isNeg = false - no_overflow: StUint[bits] +# var +# curr = 0 # Current index in the string +# isNeg = false +# no_overflow: Stuint[bits] - if input[curr] == '-': - doAssert radix == 10, "Negative numbers are only supported with base 10 input." - isNeg = true - inc curr - else: - skipPrefixes(curr, input, radix) +# if input[curr] == '-': +# doAssert radix == 10, "Negative numbers are only supported with base 10 input." 
+# isNeg = true +# inc curr +# else: +# skipPrefixes(curr, input, radix) - while curr < input.len: - # TODO: overflow detection - when radix <= 10: - no_overflow = no_overflow * base + input[curr].readDecChar.stuint(bits) - else: - no_overflow = no_overflow * base + input[curr].readHexChar.stuint(bits) - nextNonBlank(curr, input) +# while curr < input.len: +# # TODO: overflow detection +# when radix <= 10: +# no_overflow = no_overflow * base + input[curr].readDecChar.stuint(bits) +# else: +# no_overflow = no_overflow * base + input[curr].readHexChar.stuint(bits) +# nextNonBlank(curr, input) - # TODO: we can't create the lowest int this way - if isNeg: - result = -convert[T](no_overflow) - else: - result = convert[T](no_overflow) +# # TODO: we can't create the lowest int this way +# if isNeg: +# result = -convert[T](no_overflow) +# else: +# result = convert[T](no_overflow) func fromHex*(T: typedesc[StUint|StInt], s: string): T {.inline.} = ## Convert an hex string to the corresponding unsigned integer @@ -316,119 +283,121 @@ func hexToUint*[bits: static[int]](hexString: string): StUint[bits] {.inline.} = ## Convert an hex string to the corresponding unsigned integer parse(hexString, type result, radix = 16) -func toString*[bits: static[int]](num: StUint[bits], radix: static[uint8] = 10): string = - ## Convert a StInt or StUint to string. - ## In case of negative numbers: - ## - they are prefixed with "-" for base 10. - ## - if not base 10, they are returned raw in two-complement form. - - static: doAssert (radix >= 2) and radix <= 16, "Only base from 2..16 are supported" - # TODO: use static[range[2 .. 
16]], not supported at the moment (2018-04-26) - - const hexChars = "0123456789abcdef" - const base = radix.uint8.stuint(bits) - - result = "" - var (q, r) = divmod(num, base) - - while true: - when bitsof(r.data) <= 64: - result.add hexChars[r.data.int] - else: - result.add hexChars[r.truncate(int)] - if q.isZero: - break - (q, r) = divmod(q, base) - - reverse(result) - -func toString*[bits: static[int]](num: StInt[bits], radix: static[int8] = 10): string = - ## Convert a StInt or StUint to string. - ## In case of negative numbers: - ## - they are prefixed with "-" for base 10. - ## - if not base 10, they are returned raw in two-complement form. - - static: doAssert (radix >= 2) and radix <= 16, "Only base from 2..16 are supported" - # TODO: use static[range[2 .. 16]], not supported at the moment (2018-04-26) - - const hexChars = "0123456789abcdef" - const base = radix.int8.stuint(bits) - - result = "" - - type T = StUint[bits] - let isNeg = num.isNegative - let num = convert[T](if radix == 10 and isNeg: -num - else: num) - - var (q, r) = divmod(num, base) - - while true: - when bitsof(r.data) <= 64: - result.add hexChars[r.data.int] - else: - result.add hexChars[r.truncate(int)] - if q.isZero: - break - (q, r) = divmod(q, base) - - if isNeg and radix == 10: - result.add '-' - - reverse(result) - -func `$`*(num: StInt or StUint): string {.inline.}= - when num.data is SomeInteger: - $num.data - else: - toString(num, 10) - -func toHex*[bits: static[int]](num: StInt[bits] or StUint[bits]): string {.inline.}= - ## Convert to a hex string. - ## Output is considered a big-endian base 16 string. - ## Leading zeros are stripped. Use dumpHex instead if you need the in-memory representation - toString(num, 16) - -func dumpHex*(x: StInt or StUint, order: static[Endianness] = bigEndian): string = - ## Stringify an int to hex. - ## Note. Leading zeros are not removed. Use toString(n, base = 16)/toHex instead. - ## - ## You can specify bigEndian or littleEndian order. - ## i.e. 
in bigEndian: - ## - 1.uint64 will be 00000001 - ## - (2.uint128)^64 + 1 will be 0000000100000001 - ## - ## in littleEndian: - ## - 1.uint64 will be 01000000 - ## - (2.uint128)^64 + 1 will be 0100000001000000 - - const - hexChars = "0123456789abcdef" - size = bitsof(x.data) div 8 - - result = newString(2*size) - - when nimvm: - for i in 0 ..< size: - when order == system.cpuEndian: - let byte = x.data.getByte(i) - else: - let byte = x.data.getByte(size - 1 - i) - result[2*i] = hexChars[int byte shr 4 and 0xF] - result[2*i+1] = hexChars[int byte and 0xF] - else: - {.pragma: restrict, codegenDecl: "$# __restrict $#".} - let bytes {.restrict.}= cast[ptr array[size, byte]](x.unsafeAddr) - - for i in 0 ..< size: - when order == system.cpuEndian: - result[2*i] = hexChars[int bytes[i] shr 4 and 0xF] - result[2*i+1] = hexChars[int bytes[i] and 0xF] - else: - result[2*i] = hexChars[int bytes[bytes[].high - i] shr 4 and 0xF] - result[2*i+1] = hexChars[int bytes[bytes[].high - i] and 0xF] - -proc initFromBytesBE*[bits: static[int]](val: var StUint[bits], ba: openArray[byte], allowPadding: static[bool] = true) = +# func toString*[bits: static[int]](num: StUint[bits], radix: static[uint8] = 10): string = +# ## Convert a Stint or Stuint to string. +# ## In case of negative numbers: +# ## - they are prefixed with "-" for base 10. +# ## - if not base 10, they are returned raw in two-complement form. + +# static: doAssert (radix >= 2) and radix <= 16, "Only base from 2..16 are supported" +# # TODO: use static[range[2 .. 
16]], not supported at the moment (2018-04-26) + +# const hexChars = "0123456789abcdef" +# const base = radix.uint8.stuint(bits) + +# result = "" +# var (q, r) = divmod(num, base) + +# while true: +# when bitsof(r.data) <= 64: +# result.add hexChars[r.data.int] +# else: +# result.add hexChars[r.truncate(int)] +# if q.isZero: +# break +# (q, r) = divmod(q, base) + +# reverse(result) + +# func toString*[bits: static[int]](num: Stint[bits], radix: static[int8] = 10): string = +# ## Convert a Stint or Stuint to string. +# ## In case of negative numbers: +# ## - they are prefixed with "-" for base 10. +# ## - if not base 10, they are returned raw in two-complement form. + +# static: doAssert (radix >= 2) and radix <= 16, "Only base from 2..16 are supported" +# # TODO: use static[range[2 .. 16]], not supported at the moment (2018-04-26) + +# const hexChars = "0123456789abcdef" +# const base = radix.int8.stuint(bits) + +# result = "" + +# type T = Stuint[bits] +# let isNeg = num.isNegative +# let num = convert[T](if radix == 10 and isNeg: -num +# else: num) + +# var (q, r) = divmod(num, base) + +# while true: +# when bitsof(r.data) <= 64: +# result.add hexChars[r.data.int] +# else: +# result.add hexChars[r.truncate(int)] +# if q.isZero: +# break +# (q, r) = divmod(q, base) + +# if isNeg and radix == 10: +# result.add '-' + +# reverse(result) + +# func `$`*(num: Stint or StUint): string {.inline.}= +# when num.data is SomeInteger: +# $num.data +# else: +# toString(num, 10) + +# func toHex*[bits: static[int]](num: Stint[bits] or StUint[bits]): string {.inline.}= +# ## Convert to a hex string. +# ## Output is considered a big-endian base 16 string. +# ## Leading zeros are stripped. Use dumpHex instead if you need the in-memory representation +# toString(num, 16) + +# func dumpHex*(x: Stint or StUint, order: static[Endianness] = bigEndian): string = +# ## Stringify an int to hex. +# ## Note. Leading zeros are not removed. Use toString(n, base = 16)/toHex instead. 
+# ## +# ## You can specify bigEndian or littleEndian order. +# ## i.e. in bigEndian: +# ## - 1.uint64 will be 00000001 +# ## - (2.uint128)^64 + 1 will be 0000000100000001 +# ## +# ## in littleEndian: +# ## - 1.uint64 will be 01000000 +# ## - (2.uint128)^64 + 1 will be 0100000001000000 + +# const +# hexChars = "0123456789abcdef" +# size = bitsof(x.data) div 8 + +# result = newString(2*size) + +# when nimvm: +# for i in 0 ..< size: +# when order == system.cpuEndian: +# let byte = x.data.getByte(i) +# else: +# let byte = x.data.getByte(size - 1 - i) +# result[2*i] = hexChars[int byte shr 4 and 0xF] +# result[2*i+1] = hexChars[int byte and 0xF] +# else: +# {.pragma: restrict, codegenDecl: "$# __restrict $#".} +# let bytes {.restrict.}= cast[ptr array[size, byte]](x.unsafeaddr) + +# for i in 0 ..< size: +# when order == system.cpuEndian: +# result[2*i] = hexChars[int bytes[i] shr 4 and 0xF] +# result[2*i+1] = hexChars[int bytes[i] and 0xF] +# else: +# result[2*i] = hexChars[int bytes[bytes[].high - i] shr 4 and 0xF] +# result[2*i+1] = hexChars[int bytes[bytes[].high - i] and 0xF] + +proc initFromBytesBE*[bits: static[int]](val: var Stuint[bits], + ba: openarray[byte], + allowPadding: static[bool] = true) {.deprecated:"Use fromBytesBE instead".}= ## Initializes a UInt[bits] value from a byte buffer storing a big-endian ## representation of a number. ## @@ -484,48 +453,35 @@ func significantBytesBE*(val: openArray[byte]): int {.deprecated.}= return val.len - i return 1 -func fromBytesBE*(T: type StUint, ba: openArray[byte], - allowPadding: static[bool] = true): T = +func fromBytesBE*(T: type Stuint, ba: openarray[byte], + allowPadding: static[bool] = true): T {.noInit, inline.} = ## This function provides a convenience wrapper around `initFromBytesBE`. 
- result.initFromBytesBE(ba, allowPadding) + when not allowPadding: + {.deprecated: "fromBytesBE without padding is deprecated".} + result.initFromBytesBE(ba, allowPadding) + else: + result = endians2.fromBytesBE(T, ba) -func readUintBE*[bits: static[int]](ba: openArray[byte]): StUint[bits] = +func readUintBE*[bits: static[int]](ba: openarray[byte]): Stuint[bits] {.noInit, inline.}= ## Convert a big-endian array of (bits div 8) Bytes to an UInt[bits] (in native host endianness) ## Input: ## - a big-endian openArray of size (bits div 8) at least ## Returns: ## - A unsigned integer of the same size with `bits` bits ## - ## ⚠ If the openArray length is bigger than bits div 8, part converted is undefined behaviour. - result.initFromBytesBE(ba, false) + ## ⚠ If the openarray length is bigger than bits div 8, part converted is undefined behaviour. + result = endians2.fromBytesBE(Stuint[bits], ba) -func toByteArrayBE*[bits: static[int]](n: StUint[bits]): array[bits div 8, byte] = +func toByteArrayBE*[bits: static[int]](n: StUint[bits]): array[bits div 8, byte] {.noInit, inline.}= ## Convert a uint[bits] to to a big-endian array of bits div 8 bytes ## Input: ## - an unsigned integer ## Returns: ## - a big-endian array of the same size - - const N = bits div 8 - - when nimvm: - for i in 0 ..< N: - when system.cpuEndian == bigEndian: - result[i] = n.data.getByte(i) - else: - result[i] = n.data.getByte(N - 1 - i) - else: - when system.cpuEndian == bigEndian: - result = cast[type result](n) - else: - {.pragma: restrict, codegenDecl: "$# __restrict $#".} - let n_ptr {.restrict.} = cast[ptr array[N, byte]](n.unsafeAddr) - for i in 0 ..< N: - result[N-1 - i] = n_ptr[i] + result = n.toBytes(bigEndian) template hash*(num: StUint|StInt): Hash = # TODO: # `hashData` is not particularly efficient. # Explore better hashing solutions in nim-stew. 
hashData(unsafeAddr num, sizeof num) - diff --git a/stint/private/bitops2_priv.nim b/stint/private/bitops2_priv.nim deleted file mode 100644 index 0c2479f..0000000 --- a/stint/private/bitops2_priv.nim +++ /dev/null @@ -1,58 +0,0 @@ -# Stint -# Copyright 2018 Status Research & Development GmbH -# Licensed under either of -# -# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) -# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) -# -# at your option. This file may not be copied, modified, or distributed except according to those terms. - -import ./datatypes, ./conversion, stew/bitops2 -export bitops2 - -# Bitops from support library - -template bitsof*(x: UintImpl): int = - # XXX: https://github.com/nim-lang/Nim/issues/9494 - mixin bitsof - bitsof(x.lo) * 2 - -template bitsof*(x: IntImpl): int = - # XXX: https://github.com/nim-lang/Nim/issues/9494 - mixin bitsof - bitsof(x.lo) * 2 - -template bitsof*(x: typedesc[UintImpl]): int = - # XXX: https://github.com/nim-lang/Nim/issues/9494 - mixin bitsof - bitsof(x.lo) * 2 - -func countOnes*(x: UintImpl): int {.inline.} = - countOnes(x.lo) + countOnes(x.hi) - -func countZeros*(x: UintImpl): int {.inline.} = - countZeros(x.lo) + countOnes(x.hi) - -func parity*(x: UintImpl): int {.inline.} = - parity(x.lo) xor parity(x.hi) - -func leadingZeros*(x: UintImpl): int {.inline.} = - let tmp = x.hi.leadingZeros() - if tmp == bitsof(x.hi): - x.lo.leadingZeros() + bitsof(x.hi) - else: - tmp - -func trailingZeros*(x: UintImpl): int {.inline.} = - let tmp = x.lo.trailingZeros() - if tmp == bitsof(x.lo): - tmp + x.hi.trailingZeros() - else: - tmp - -func firstOne*(x: UintImpl): int {.inline.} = - let tmp = trailingZeros(x) - if tmp == bitsof(x): - 0 - else: - 1 + tmp diff --git a/stint/private/compiletime_helpers.nim b/stint/private/compiletime_helpers.nim deleted file mode 100644 index e49e361..0000000 --- a/stint/private/compiletime_helpers.nim 
+++ /dev/null @@ -1,82 +0,0 @@ -import - ./datatypes, ./uint_bitwise_ops, ./bitops2_priv, ./int_bitwise_ops, - ./compiletime_cast - -export compiletime_cast - -func getByte*(x: SomeInteger, pos: int): byte {.compileTime.} = - type DT = type x - when bitsof(DT) == 8: - cast[byte](x) - else: - byte((x shr (pos * 8)) and 0xFF.DT) - -func getByte*(x: UintImpl | IntImpl, pos: int): byte {.compileTime.} = - type DT = type x.leastSignificantWord - when bitsof(DT) == 8: - cast[byte](x.leastSignificantWord) - else: - byte((x shr (pos * 8)).leastSignificantWord and 0xFF.DT) - -proc setByte*(x: var SomeInteger, pos: int, b: byte) {.compileTime.} = - type DT = type x - x = x or (DT(b) shl (pos*8)) - -type SomeIntImpl = UintImpl | IntImpl -func setByte*(x: var SomeIntImpl, pos: int, b: byte) {.compileTime.} = - proc putFirstByte(x: var SomeInteger, b: byte) = - type DT = type x - x = x or b.DT - - proc putFirstByte(x: var UintImpl, b: byte) = - putFirstByte(x.lo, b) - - var cx: type x - cx.putFirstByte(b) - x = x or (cx shl (pos*8)) - -func copyToArray*(ret: var openArray[byte], x: UintImpl) {.compileTime.} = - const size = bitsof(x) div 8 - doAssert ret.len >= size - for i in 0 ..< size: - ret[i] = x.getByte(i) - -func copyFromArray*(x: var UintImpl, data: openArray[byte]) {.compileTime.} = - const size = bitsof(x) div 8 - doAssert data.len >= size - for i in 0 ..< size: - x.setByte(i, data[i]) - -func copyFromArray*(x: var SomeInteger, data: openArray[byte]) {.compileTime.} = - const size = bitsof(x) div 8 - doAssert data.len >= size - for i in 0 ..< size: - x.setByte(i, data[i]) - -template vmIntCast*[T](data: SomeInteger): T = - type DT = type data - const - bits = bitsof(T) - DTbits = bitsof(DT) - - # we use esoteric type juggling here to trick the Nim VM - when bits == 64: - when DTbits == 64: - cast[T](data) - else: - cast[T](uint64(data and DT(0xFFFFFFFF_FFFFFFFF))) - elif bits == 32: - when DTbits == 32: - cast[T](data) - else: - cast[T](uint32(data and 
DT(0xFFFFFFFF))) - elif bits == 16: - when DTbits == 16: - cast[T](data) - else: - cast[T](uint16(data and DT(0xFFFF))) - else: - when DTBits == 8: - cast[T](data) - else: - cast[T](uint8(data and DT(0xFF))) diff --git a/stint/private/datatypes.nim b/stint/private/datatypes.nim index 4fa1cfb..6e4f27c 100644 --- a/stint/private/datatypes.nim +++ b/stint/private/datatypes.nim @@ -39,6 +39,8 @@ type Carry* = uint8 # distinct range[0'u8 .. 1] Borrow* = uint8 # distinct range[0'u8 .. 1] + SomeBigInteger*[bits: static[int]] = Stuint[bits]|Stint[bits] + const GCC_Compatible* = defined(gcc) or defined(clang) or defined(llvm_gcc) const X86* = defined(amd64) or defined(i386) @@ -46,65 +48,80 @@ when sizeof(int) == 8 and GCC_Compatible: type uint128*{.importc: "unsigned __int128".} = object +# Accessors +# -------------------------------------------------------- + template leastSignificantWord*(num: SomeInteger): auto = num -func leastSignificantWord*(limbs: Limbs): auto {.inline.} = +func leastSignificantWord*(a: SomeBigInteger): auto {.inline.} = when cpuEndian == littleEndian: - limbs[0] + a.limbs[0] else: - limbs[^1] + a.limbs[^1] -func mostSignificantWord*(limbs: Limbs): auto {.inline.} = +func mostSignificantWord*(a: SomeBigInteger): auto {.inline.} = when cpuEndian == littleEndian: - limbs[^1] + a.limbs[^1] else: - limbs[0] + a.limbs[0] -iterator leastToMostSig*(limbs: Limbs): Word = +# Iterations +# -------------------------------------------------------- + +iterator leastToMostSig*(a: SomeBigInteger): Word = ## Iterate from least to most significant word when cpuEndian == littleEndian: - for i in 0 ..< limbs.len: - yield limbs[i] + for i in 0 ..< a.limbs.len: + yield a.limbs[i] else: - for i in countdown(limbs.len-1, 0): - yield limbs[i] + for i in countdown(a.limbs.len-1, 0): + yield a.limbs[i] -iterator leastToMostSig*(limbs: var Limbs): var Word = +iterator leastToMostSig*(a: var SomeBigInteger): var Word = ## Iterate from least to most significant word when 
cpuEndian == littleEndian: - for i in 0 ..< limbs.len: - yield limbs[i] + for i in 0 ..< a.limbs.len: + yield a.limbs[i] else: - for i in countdown(limbs.len-1, 0): - yield limbs[i] + for i in countdown(a.limbs.len-1, 0): + yield a.limbs[i] -iterator leastToMostSig*(aLimbs, bLimbs: Limbs): (Word, Word) = +iterator leastToMostSig*(a, b: SomeBigInteger): (Word, Word) = ## Iterate from least to most significant word when cpuEndian == littleEndian: - for i in 0 ..< aLimbs.len: - yield (aLimbs[i], bLimbs[i]) + for i in 0 ..< a.limbs.len: + yield (a.limbs[i], b.limbs[i]) else: - for i in countdown(aLimbs.len-1, 0): - yield (aLimbs[i], bLimbs[i]) + for i in countdown(a.limbs.len-1, 0): + yield (a.limbs[i], b.limbs[i]) -iterator leastToMostSig*(aLimbs: var Limbs, bLimbs: Limbs): (var Word, Word) = +iterator leastToMostSig*[aBits, bBits](a: var SomeBigInteger[aBits], b: SomeBigInteger[bBits]): (var Word, Word) = ## Iterate from least to most significant word when cpuEndian == littleEndian: - for i in 0 ..< aLimbs.len: - yield (aLimbs[i], bLimbs[i]) + for i in 0 ..< min(a.limbs.len, b.limbs.len): + yield (a.limbs[i], b.limbs[i]) else: - for i in countdown(aLimbs.len-1, 0): - yield (aLimbs[i], bLimbs[i]) + for i in countdown(min(aLimbs.len, b.limbs.len)-1, 0): + yield (a.limbs[i], b.limbs[i]) -iterator leastToMostSig*(cLimbs: var Limbs, aLimbs: Limbs, bLimbs: Limbs): (var Word, Word, Word) = +iterator leastToMostSig*(c: var SomeBigInteger, a, b: SomeBigInteger): (var Word, Word, Word) = ## Iterate from least to most significant word when cpuEndian == littleEndian: - for i in 0 ..< aLimbs.len: - yield (cLimbs[i], aLimbs[i], bLimbs[i]) + for i in 0 ..< a.limbs.len: + yield (c.limbs[i], a.limbs[i], b.limbs[i]) + else: + for i in countdown(a.limbs.len-1, 0): + yield (c.limbs[i], a.limbs[i], b.limbs[i]) + +iterator mostToLeastSig*(a: SomeBigInteger): Word = + ## Iterate from most to least significant word + when cpuEndian == bigEndian: + for i in 0 ..< a.limbs.len: + yield 
a.limbs[i] else: - for i in countdown(aLimbs.len-1, 0): - yield (cLimbs[i], aLimbs[i], bLimbs[i]) + for i in countdown(a.limbs.len-1, 0): + yield a.limbs[i] import std/macros diff --git a/stint/private/endians2_priv.nim b/stint/private/endians2_priv.nim deleted file mode 100644 index fb02b1a..0000000 --- a/stint/private/endians2_priv.nim +++ /dev/null @@ -1,26 +0,0 @@ -import ./bitops2_priv, ./datatypes, ./compiletime_helpers -import stew/endians2 -export endians2 - -func swapBytes*(x: UintImpl): UintImpl {.inline.} = - let lo = swapBytes(x.hi) - let hi = swapBytes(x.lo) - - UintImpl(hi: hi, lo: lo) - -func toBytes*(x: UintImpl, endian: Endianness = system.cpuEndian): auto {.inline.} = - # TODO can't use bitsof in return type (compiler bug?), hence return auto - var ret: array[bitsof(x) div 8, byte] - when nimvm: - if endian == system.cpuEndian: - copyToArray(ret, x) - else: - let v = swapBytes(x) - copyToArray(ret, v) - else: - if endian == system.cpuEndian: - copyMem(addr ret[0], unsafeAddr x, ret.len) - else: - let v = swapBytes(x) - copyMem(addr ret[0], unsafeAddr v, ret.len) - ret diff --git a/stint/private/initialization.nim b/stint/private/initialization.nim deleted file mode 100644 index 1c4b108..0000000 --- a/stint/private/initialization.nim +++ /dev/null @@ -1,19 +0,0 @@ -# Stint -# Copyright 2018 Status Research & Development GmbH -# Licensed under either of -# -# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) -# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) -# -# at your option. This file may not be copied, modified, or distributed except according to those terms. 
- -import ./datatypes - -func zero*(T: typedesc): T {.inline.} = - discard - -func one*(T: typedesc[SomeInteger]): T {.inline.} = - 1 - -func one*(T: typedesc[UintImpl or IntImpl]): T {.inline.} = - result.lo = one(type result.lo) diff --git a/stint/private/primitives/addcarry_subborrow.nim b/stint/private/primitives/addcarry_subborrow.nim index c4e27df..75aa21a 100644 --- a/stint/private/primitives/addcarry_subborrow.nim +++ b/stint/private/primitives/addcarry_subborrow.nim @@ -7,7 +7,7 @@ # # at your option. This file may not be copied, modified, or distributed except according to those terms. -import ../datatypes +import ../datatypes, ./compiletime_fallback # ############################################################ # @@ -105,65 +105,82 @@ when X86: func addC*(cOut: var Carry, sum: var uint32, a, b: uint32, cIn: Carry) {.inline.} = ## Addition with carry ## (CarryOut, Sum) <- a + b + CarryIn - when X86: - cOut = addcarry_u32(cIn, a, b, sum) - else: + when nimvm: let dblPrec = uint64(cIn) + uint64(a) + uint64(b) sum = (uint32)(dblPrec) cOut = Carry(dblPrec shr 32) + else: + when X86: + cOut = addcarry_u32(cIn, a, b, sum) + else: + let dblPrec = uint64(cIn) + uint64(a) + uint64(b) + sum = (uint32)(dblPrec) + cOut = Carry(dblPrec shr 32) func subB*(bOut: var Borrow, diff: var uint32, a, b: uint32, bIn: Borrow) {.inline.} = ## Substraction with borrow ## (BorrowOut, Diff) <- a - b - borrowIn - when X86: - bOut = subborrow_u32(bIn, a, b, diff) - else: + when nimvm: let dblPrec = uint64(a) - uint64(b) - uint64(bIn) diff = (uint32)(dblPrec) # On borrow the high word will be 0b1111...1111 and needs to be masked bOut = Borrow((dblPrec shr 32) and 1) + else: + when X86: + bOut = subborrow_u32(bIn, a, b, diff) + else: + let dblPrec = uint64(a) - uint64(b) - uint64(bIn) + diff = (uint32)(dblPrec) + # On borrow the high word will be 0b1111...1111 and needs to be masked + bOut = Borrow((dblPrec shr 32) and 1) func addC*(cOut: var Carry, sum: var uint64, a, b: uint64, cIn: 
Carry) {.inline.} = ## Addition with carry ## (CarryOut, Sum) <- a + b + CarryIn - when X86: - cOut = addcarry_u64(cIn, a, b, sum) + when nimvm: + addC_nim(cOut, sum, a, b, cIn) else: - block: - static: - doAssert GCC_Compatible - doAssert sizeof(int) == 8 - - var dblPrec {.noInit.}: uint128 - {.emit:[dblPrec, " = (unsigned __int128)", a," + (unsigned __int128)", b, " + (unsigned __int128)",cIn,";"].} - - # Don't forget to dereference the var param in C mode - when defined(cpp): - {.emit:[cOut, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} - {.emit:[sum, " = (NU64)", dblPrec,";"].} - else: - {.emit:["*",cOut, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} - {.emit:["*",sum, " = (NU64)", dblPrec,";"].} + when X86: + cOut = addcarry_u64(cIn, a, b, sum) + else: + block: + static: + doAssert GCC_Compatible + doAssert sizeof(int) == 8 + + var dblPrec {.noInit.}: uint128 + {.emit:[dblPrec, " = (unsigned __int128)", a," + (unsigned __int128)", b, " + (unsigned __int128)",cIn,";"].} + + # Don't forget to dereference the var param in C mode + when defined(cpp): + {.emit:[cOut, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:[sum, " = (NU64)", dblPrec,";"].} + else: + {.emit:["*",cOut, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:["*",sum, " = (NU64)", dblPrec,";"].} func subB*(bOut: var Borrow, diff: var uint64, a, b: uint64, bIn: Borrow) {.inline.} = ## Substraction with borrow ## (BorrowOut, Diff) <- a - b - borrowIn - when X86: - bOut = subborrow_u64(bIn, a, b, diff) + when nimvm: + subB_nim(bOut, diff, a, b, bIn) else: - block: - static: - doAssert GCC_Compatible - doAssert sizeof(int) == 8 - - var dblPrec {.noInit.}: uint128 - {.emit:[dblPrec, " = (unsigned __int128)", a," - (unsigned __int128)", b, " - (unsigned __int128)",bIn,";"].} - - # Don't forget to dereference the var param in C mode - # On borrow the high word will be 0b1111...1111 and needs to be masked - when defined(cpp): - {.emit:[bOut, " = (NU64)(", dblPrec," >> ", 64'u64, ") & 1;"].} - 
{.emit:[diff, " = (NU64)", dblPrec,";"].} - else: - {.emit:["*",bOut, " = (NU64)(", dblPrec," >> ", 64'u64, ") & 1;"].} - {.emit:["*",diff, " = (NU64)", dblPrec,";"].} + when X86: + bOut = subborrow_u64(bIn, a, b, diff) + else: + block: + static: + doAssert GCC_Compatible + doAssert sizeof(int) == 8 + + var dblPrec {.noInit.}: uint128 + {.emit:[dblPrec, " = (unsigned __int128)", a," - (unsigned __int128)", b, " - (unsigned __int128)",bIn,";"].} + + # Don't forget to dereference the var param in C mode + # On borrow the high word will be 0b1111...1111 and needs to be masked + when defined(cpp): + {.emit:[bOut, " = (NU64)(", dblPrec," >> ", 64'u64, ") & 1;"].} + {.emit:[diff, " = (NU64)", dblPrec,";"].} + else: + {.emit:["*",bOut, " = (NU64)(", dblPrec," >> ", 64'u64, ") & 1;"].} + {.emit:["*",diff, " = (NU64)", dblPrec,";"].} diff --git a/stint/private/primitives/compiletime_fallback.nim b/stint/private/primitives/compiletime_fallback.nim new file mode 100644 index 0000000..35a15e0 --- /dev/null +++ b/stint/private/primitives/compiletime_fallback.nim @@ -0,0 +1,108 @@ +# Stint +# Copyright 2018 Status Research & Development GmbH +# Licensed under either of +# +# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +# +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+ +import ../datatypes + +# ############################################################ +# +# VM fallback +# +# ############################################################ + +const + HalfWidth = WordBitWidth shr 1 + HalfBase = (Word(1) shl HalfWidth) + HalfMask = HalfBase - 1 + +func hi(n: Word): Word = + result = n shr HalfWidth + +func lo(n: Word): Word = + result = n and HalfMask + +func split(n: Word): tuple[hi, lo: Word] = + result.hi = n.hi + result.lo = n.lo + +func merge(hi, lo: Word): Word = + (hi shl HalfWidth) or lo + +func addC_nim*(cOut: var Carry, sum: var Word, a, b: Word, cIn: Carry) = + # Add with carry, fallback for the Compile-Time VM + # (CarryOut, Sum) <- a + b + CarryIn + let (aHi, aLo) = split(a) + let (bHi, bLo) = split(b) + let tLo = aLo + bLo + cIn + let (cLo, rLo) = split(tLo) + let tHi = aHi + bHi + cLo + let (cHi, rHi) = split(tHi) + cOut = Carry(cHi) + sum = merge(rHi, rLo) + +func subB_nim*(bOut: var Borrow, diff: var Word, a, b: Word, bIn: Borrow) = + # Substract with borrow, fallback for the Compile-Time VM + # (BorrowOut, Sum) <- a - b - BorrowIn + let (aHi, aLo) = split(a) + let (bHi, bLo) = split(b) + let tLo = HalfBase + aLo - bLo - bIn + let (noBorrowLo, rLo) = split(tLo) + let tHi = HalfBase + aHi - bHi - Word(noBorrowLo == 0) + let (noBorrowHi, rHi) = split(tHi) + bOut = Borrow(noBorrowHi == 0) + diff = merge(rHi, rLo) + +func mul_nim*(hi, lo: var Word, u, v: Word) = + ## Extended precision multiplication + ## (hi, lo) <- u * v + var x0, x1, x2, x3: Word + + let + (uh, ul) = u.split() + (vh, vl) = v.split() + + x0 = ul * vl + x1 = ul * vh + x2 = uh * vl + x3 = uh * vh + + x1 += hi(x0) # This can't carry + x1 += x2 # but this can + if x1 < x2: # if carry, add it to x3 + x3 += HalfBase + + hi = x3 + hi(x1) + lo = merge(x1, lo(x0)) + +func muladd1_nim*(hi, lo: var Word, a, b, c: Word) {.inline.} = + ## Extended precision multiplication + addition + ## (hi, lo) <- a*b + c + ## + ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 
0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) + ## so adding any c cannot overflow + var carry: Carry + mul_nim(hi, lo, a, b) + addC_nim(carry, lo, lo, c, 0) + addC_nim(carry, hi, hi, 0, carry) + +func muladd2_nim*(hi, lo: var Word, a, b, c1, c2: Word) {.inline.}= + ## Extended precision multiplication + addition + addition + ## (hi, lo) <- a*b + c1 + c2 + ## + ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) + ## so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000) + ## and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing + var carry1, carry2: Carry + + mul_nim(hi, lo, a, b) + # Carry chain 1 + addC_nim(carry1, lo, lo, c1, 0) + addC_nim(carry1, hi, hi, 0, carry1) + # Carry chain 2 + addC_nim(carry2, lo, lo, c2, 0) + addC_nim(carry2, hi, hi, 0, carry2) diff --git a/stint/private/uint_addsub.nim b/stint/private/uint_addsub.nim deleted file mode 100644 index c99795e..0000000 --- a/stint/private/uint_addsub.nim +++ /dev/null @@ -1,46 +0,0 @@ -# Stint -# Copyright 2018 Status Research & Development GmbH -# Licensed under either of -# -# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) -# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) -# -# at your option. This file may not be copied, modified, or distributed except according to those terms. 
- -import - ./datatypes, - ./primitives/addcarry_subborrow - -# ############ Addition & Substraction ############ # -{.push raises: [], inline, noInit, gcsafe.} - -func `+`*(x, y: Limbs): Limbs = - # Addition for multi-precision unsigned int - var carry = Carry(0) - for wr, wx, wy in leastToMostSig(result, x, y): - addC(carry, wr, wx, wy, carry) - -func `+=`*(x: var Limbs, y: Limbs) = - ## In-place addition for multi-precision unsigned int - var carry = Carry(0) - for wx, wy in leastToMostSig(x, y): - addC(carry, wx, wx, wy, carry) - -func `-`*(x, y: Limbs): Limbs = - # Substraction for multi-precision unsigned int - var borrow = Borrow(0) - for wr, wx, wy in leastToMostSig(result, x, y): - subB(borrow, wr, wx, wy, borrow) - -func `-=`*(x: var Limbs, y: Limbs) = - ## In-place substraction for multi-precision unsigned int - var borrow = Borrow(0) - for wx, wy in leastToMostSig(x, y): - subB(borrow, wx, wx, wy, borrow) - -func inc*(x: var Limbs, w: SomeUnsignedInt = 1) = - var carry = Carry(0) - when cpuEndian == littleEndian: - addC(carry, x[0], x[0], w, carry) - for i in 1 ..< x.len: - addC(carry, x[i], x[i], 0, carry) diff --git a/stint/private/uint_bitwise_ops.nim b/stint/private/uint_bitwise_ops.nim deleted file mode 100644 index 0afff9f..0000000 --- a/stint/private/uint_bitwise_ops.nim +++ /dev/null @@ -1,60 +0,0 @@ -# Stint -# Copyright 2018 Status Research & Development GmbH -# Licensed under either of -# -# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) -# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) -# -# at your option. This file may not be copied, modified, or distributed except according to those terms. 
- -import ./datatypes - -{.push raises: [], inline, noInit, gcsafe.} - -func `not`*(x: Limbs): Limbs = - ## Bitwise complement of unsigned integer x - for wr, wx in leastToMostSig(result, x): - wr = not wx - -func `or`*(x, y: Limbs): Limbs = - ## `Bitwise or` of numbers x and y - for wr, wx, wy in leastToMostSig(result, x, y): - wr = wx or wy - -func `and`*(x, y: Limbs): Limbs = - ## `Bitwise and` of numbers x and y - for wr, wx, wy in leastToMostSig(result, x, y): - wr = wx and wy - -func `xor`*(x, y: Limbs): Limbs = - ## `Bitwise xor` of numbers x and y - for wr, wx, wy in leastToMostSig(result, x, y): - wr = wx xor wy - -func `shr`*(x: Limbs, k: SomeInteger): Limbs = - ## Shift right by k. - ## - ## k MUST be less than the base word size (2^32 or 2^64) - # Note: for speed, loading a[i] and a[i+1] - # instead of a[i-1] and a[i] - # is probably easier to parallelize for the compiler - # (antidependence WAR vs loop-carried dependence RAW) - when cpuEndian == littleEndian: - for i in 0 ..< x.len-1: - result[i] = (x[i] shr k) or (x[i+1] shl (WordBitWidth - k)) - result[^1] = x[^1] shr k - else: - for i in countdown(x.len-1, 1): - result[i] = (x[i] shr k) or (x[i-1] shl (WordBitWidth - k)) - result[0] = x[0] shr k - -func `shl`*(x: Limbs, k: SomeInteger): Limbs = - ## Compute the `shift left` operation of x and k - when cpuEndian == littleEndian: - result[0] = x[0] shl k - for i in 1 ..< x.len: - result[i] = (x[i] shl k) or (x[i-1] shr (WordBitWidth - k)) - else: - result[^1] = x[^1] shl k - for i in countdown(x.len-2, 0): - result[i] = (x[i] shl k) or (x[i+1] shr (WordBitWidth - k)) diff --git a/stint/private/uint_comparison.nim b/stint/private/uint_comparison.nim deleted file mode 100644 index 48832da..0000000 --- a/stint/private/uint_comparison.nim +++ /dev/null @@ -1,54 +0,0 @@ -# Stint -# Copyright 2018 Status Research & Development GmbH -# Licensed under either of -# -# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or 
http://www.apache.org/licenses/LICENSE-2.0) -# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) -# -# at your option. This file may not be copied, modified, or distributed except according to those terms. - -import - ./datatypes, - ./primitives/addcarry_subborrow - -{.push raises: [], inline, noInit, gcsafe.} - -func isZero*(n: SomeUnsignedInt): bool = - n == 0 - -func isZero*(limbs: Limbs): bool = - for word in limbs: - if not word.isZero(): - return false - return true - -func `<`*(x, y: Limbs): bool = - # Lower comparison for multi-precision integers - var diff: Word - var borrow: Borrow - for wx, wy in leastToMostSig(x, y): - subB(borrow, diff, wx, wy, borrow) - return bool(borrow) - -func `==`*(x, y: Limbs): bool = - # Equal comparison for multi-precision integers - for wx, wy in leastToMostSig(x, y): - if wx != wy: - return false - return true - -func `<=`*(x, y: Limbs): bool = - # Lower or equal comparison for multi-precision integers - not(y < x) - -func isEven*(x: SomeUnsignedInt): bool = - (x and 1) == 0 - -func isEven*(x: Limbs): bool = - x.leastSignificantWord.isEven - -func isOdd*(x: SomeUnsignedInt): bool = - not x.isEven - -func isOdd*(x: Limbs): bool = - not x.isEven diff --git a/stint/private/uint_highlow.nim b/stint/private/uint_highlow.nim deleted file mode 100644 index b70af0f..0000000 --- a/stint/private/uint_highlow.nim +++ /dev/null @@ -1,16 +0,0 @@ -# Stint -# Copyright 2018 Status Research & Development GmbH -# Licensed under either of -# -# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) -# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) -# -# at your option. This file may not be copied, modified, or distributed except according to those terms. 
- -import ./datatypes, ./initialization, ./uint_bitwise_ops - -func low*(T: typedesc[UintImpl]): T {.inline.}= - zero(T) - -func high*(T: typedesc[UintImpl]): T {.inline.}= - not zero(T) diff --git a/stint/private/uint_mul.nim b/stint/private/uint_mul.nim index 022cef5..2b574f8 100644 --- a/stint/private/uint_mul.nim +++ b/stint/private/uint_mul.nim @@ -1,5 +1,5 @@ # Stint -# Copyright 2018 Status Research & Development GmbH +# Copyright 2018-Present Status Research & Development GmbH # Licensed under either of # # * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) @@ -14,7 +14,7 @@ import # ################### Multiplication ################### # {.push raises: [], gcsafe.} -func prod*[rLen, aLen, bLen](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) = +func prod*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) = ## Multi-precision multiplication ## r <- a*b ## @@ -25,7 +25,7 @@ func prod*[rLen, aLen, bLen](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) # We use Product Scanning / Comba multiplication var t, u, v = Word(0) - var z: Limbs[rLen] # zero-init, ensure on stack and removes in-place problems + var z: typeof(r) # zero-init, ensure on stack and removes in-place problems staticFor i, 0, min(a.len+b.len, r.len): const ib = min(b.len-1, i) @@ -36,11 +36,11 @@ func prod*[rLen, aLen, bLen](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) z[i] = v v = u u = t - t = Word(0) + t = 0 r = z -func prod_high_words*[rLen, aLen, bLen]( +func prod_high_words*[rLen, aLen, bLen: static int]( r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen], lowestWordIndex: static int) = diff --git a/stint/uintops.nim b/stint/uintops.nim new file mode 100644 index 0000000..870ebe4 --- /dev/null +++ b/stint/uintops.nim @@ -0,0 +1,230 @@ +# Stint +# Copyright 2018-2020 Status Research & Development GmbH +# Licensed under either of +# +# * Apache License, version 2.0, 
([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +# +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Status lib + stew/bitops2, + # Internal + ./private/datatypes, + ./private/primitives/addcarry_subborrow + +export StUint + +# Initialization +# -------------------------------------------------------- +{.push raises: [], inline, noInit, gcsafe.} + +func setZero*(a: var StUint) = + ## Set ``a`` to 0 + zeroMem(a[0].addr, sizeof(a)) + +func setOne*(a: var StUint) = + ## Set ``a`` to 1 + when cpuEndian == littleEndian: + a.limbs[0] = 1 + when a.limbs.len > 1: + zeroMem(a.limbs[1].addr, (a.limbs.len - 1) * sizeof(SecretWord)) + else: + a.limbs[^1] = 1 + when a.limbs.len > 1: + zeroMem(a.limbs[0].addr, (a.len - 1) * sizeof(SecretWord)) + +func zero*[bits: static[int]](T: typedesc[Stuint[bits]]): T {.inline.} = + ## Returns the zero of the input type + discard + +func one*[bits: static[int]](T: typedesc[Stuint[bits]]): T {.inline.} = + ## Returns the one of the input type + result.limbs.setOne() + +func high*[bits](_: typedesc[Stuint[bits]]): Stuint[bits] {.inline.} = + for wr in leastToMostSig(result): + wr = high(Word) +func low*[bits](_: typedesc[Stuint[bits]]): Stuint[bits] {.inline.} = + discard + +{.pop.} +# Comparisons +# -------------------------------------------------------- +{.push raises: [], inline, noInit, gcsafe.} + +func isZero*(a: Stuint): bool = + for word in leastToMostSig(a): + if word != 0: + return false + return true + +func `==`*(a, b: Stuint): bool {.inline.} = + ## Unsigned `equal` comparison + for wa, wb in leastToMostSig(a, b): + if wa != wb: + return false + return true + +func `<`*(a, b: Stuint): bool {.inline.} = + ## Unsigned `less than` comparison + var diff: Word + var borrow: Borrow + for wa, wb in leastToMostSig(a, b): + subB(borrow, diff, wa, wb, 
borrow) + return bool(borrow) + +func `<=`*(a, b: Stuint): bool {.inline.} = + ## Unsigned `less or equal` comparison + not(a < b) + +func isOdd*(a: Stuint): bool {.inline.} = + ## Returns true if input is off + ## false otherwise + bool(a.leastSignificantWord and 1) + +func isEven*(a: Stuint): bool {.inline.} = + ## Returns true if input is zero + ## false otherwise + not a.isOdd + +{.pop.} +# Bitwise operations +# -------------------------------------------------------- +{.push raises: [], inline, noInit, gcsafe.} + +func `not`*(a: Stuint): Stuint = + ## Bitwise complement of unsigned integer a + ## i.e. flips all bits of the input + for wr, wa in leastToMostSig(result, a): + wr = not wa + +func `or`*(a, b: Stuint): Stuint = + ## `Bitwise or` of numbers a and b + for wr, wa, wb in leastToMostSig(result, a, b): + wr = wa or wb + +func `and`*(a, b: Stuint): Stuint = + ## `Bitwise and` of numbers a and b + for wr, wa, wb in leastToMostSig(result, a, b): + wr = wa and wb + +func `xor`*(a, b: Stuint): Stuint = + ## `Bitwise xor` of numbers x and y + for wr, wa, wb in leastToMostSig(result, a, b): + wr = wa xor wb + +func `shr`*(a: Stuint, k: SomeInteger): Stuint = + ## Shift right by k. 
+ ## + ## k MUST be less than the base word size (2^32 or 2^64) + # Note: for speed, loading a[i] and a[i+1] + # instead of a[i-1] and a[i] + # is probably easier to parallelize for the compiler + # (antidependence WAR vs loop-carried dependence RAW) + when cpuEndian == littleEndian: + for i in 0 ..< a.limbs.len-1: + result.limbs[i] = (a.limbs[i] shr k) or (a.limbs[i+1] shl (WordBitWidth - k)) + result.limbs[^1] = a.limbs[^1] shr k + else: + for i in countdown(a.limbs.len-1, 1): + result.limbs[i] = (a.limbs[i] shr k) or (a.limbs[i-1] shl (WordBitWidth - k)) + result.limbs[0] = a.limbs[0] shr k + +func `shl`*(a: Stuint, k: SomeInteger): Stuint = + ## Compute the `shift left` operation of x and k + when cpuEndian == littleEndian: + result.limbs[0] = a.limbs[0] shl k + for i in 1 ..< a.limbs.len: + result.limbs[i] = (a.limbs[i] shl k) or (a.limbs[i-1] shr (WordBitWidth - k)) + else: + result.limbs[^1] = a.limbs[^1] shl k + for i in countdown(a.limbs.len-2, 0): + result.limbs[i] = (a.limbs[i] shl k) or (a.limbs[i+1] shr (WordBitWidth - k)) + +func countOnes*(x: Stuint): int {.inline.} = + result = 0 + for wx in leastToMostSig(x): + result += countOnes(wx) + +func parity*(x: Stuint): int {.inline.} = + result = parity(x.limbs[0]) + for i in 1 ..< x.limbs.len: + result = result xor parity(x.limbs[i]) + +func leadingZeros*(x: Stuint): int {.inline.} = + result = 0 + for word in mostToLeastSig(x): + let zeroCount = word.leadingZeros() + result += zeroCount + if zeroCount != WordBitWidth: + return + +func trailingZeros*(x: Stuint): int {.inline.} = + result = 0 + for word in leastToMostSig(x): + let zeroCount = word.leadingZeros() + result += zeroCount + if zeroCount != WordBitWidth: + return + +func firstOne*(x: Stuint): int {.inline.} = + result = trailingZeros(x) + if result == x.limbs.len * WordBitWidth: + result = 0 + else: + result += 1 + +{.pop.} +# Addsub +# -------------------------------------------------------- +{.push raises: [], inline, noInit, gcsafe.} + +func 
`+`*(a, b: Stuint): Stuint = + # Addition for multi-precision unsigned int + var carry = Carry(0) + for wr, wa, wb in leastToMostSig(result, a, b): + addC(carry, wr, wa, wb, carry) + +func `+=`*(a: var Stuint, b: Stuint) = + ## In-place addition for multi-precision unsigned int + var carry = Carry(0) + for wa, wb in leastToMostSig(a, b): + addC(carry, wa, wa, wb, carry) + +func `-`*(a, b: Stuint): Stuint = + # Substraction for multi-precision unsigned int + var borrow = Borrow(0) + for wr, wa, wb in leastToMostSig(result, a, b): + subB(borrow, wr, wa, wb, borrow) + +func `-=`*(a: var Stuint, b: Stuint) = + ## In-place substraction for multi-precision unsigned int + var borrow = Borrow(0) + for wa, wb in leastToMostSig(a, b): + subB(borrow, wa, wa, wb, borrow) + +func inc*(a: var Stuint, w: Word = 1) = + var carry = Carry(0) + when cpuEndian == littleEndian: + addC(carry, x.limbs[0], x.limbs[0], w, carry) + for i in 1 ..< x.len: + addC(carry, x.limbs[i], x.limbs[i], 0, carry) + +{.pop.} +# Multiplication +# -------------------------------------------------------- +import ./private/uint_mul +{.push raises: [], inline, noInit, gcsafe.} + +func `*`*(a, b: Stuint): Stuint {.inline.} = + ## Integer multiplication + result.limbs.prod(a.limbs, b.limbs) + +{.pop.} +# Division & Modulo +# -------------------------------------------------------- + +# Exponentiation +# -------------------------------------------------------- From 2ac1ee3f1ed091c1b7c828ce24dd1270d32195e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= Date: Sat, 13 Jun 2020 12:29:31 +0200 Subject: [PATCH 07/26] Fix compiletime primitives to pass all bitwise tests except large shifts --- stint/private/datatypes.nim | 22 +++++++++++--- .../primitives/compiletime_fallback.nim | 29 ++++++++++--------- .../private/primitives/extended_precision.nim | 29 ++++++++++++------- stint/uintops.nim | 26 ++++++++++++++++- 4 files changed, 76 insertions(+), 30 deletions(-) diff --git 
a/stint/private/datatypes.nim b/stint/private/datatypes.nim index 6e4f27c..6f8bc7c 100644 --- a/stint/private/datatypes.nim +++ b/stint/private/datatypes.nim @@ -25,16 +25,30 @@ func wordsRequired*(bits: int): int {.compileTime.} = type Limbs*[N: static int] = array[N, Word] + ## Limbs type + ## Large proc like multiplication and division + ## should operate at the limb-level + ## to avoid duplicate codepaths + ## For example for Stuint[16] and Stuint[32] + ## or if allowed in the future + ## Stuint[254] and Stuint[256] StUint*[bits: static[int]] = object ## Stack-based integer ## Unsigned - limbs*: Limbs[bits.wordsRequired] + limbs*: array[bits.wordsRequired, Word] + # TODO: using the limbs type here + # can using StUint[8] of length 2, instead of 1 + # in test_uint_bitwise (in the VM) + # unless you put the following instantiation + # at the bottom of this file + # static: + # echo StUint[8]() StInt*[bits: static[int]] = object ## Stack-based integer ## Signed - limbs*: Limbs[bits.wordsRequired] + limbs*: array[bits.wordsRequired, Word] Carry* = uint8 # distinct range[0'u8 .. 1] Borrow* = uint8 # distinct range[0'u8 .. 
1] @@ -54,13 +68,13 @@ when sizeof(int) == 8 and GCC_Compatible: template leastSignificantWord*(num: SomeInteger): auto = num -func leastSignificantWord*(a: SomeBigInteger): auto {.inline.} = +template leastSignificantWord*(a: SomeBigInteger): auto = when cpuEndian == littleEndian: a.limbs[0] else: a.limbs[^1] -func mostSignificantWord*(a: SomeBigInteger): auto {.inline.} = +template mostSignificantWord*(a: SomeBigInteger): auto = when cpuEndian == littleEndian: a.limbs[^1] else: diff --git a/stint/private/primitives/compiletime_fallback.nim b/stint/private/primitives/compiletime_fallback.nim index 35a15e0..92580d9 100644 --- a/stint/private/primitives/compiletime_fallback.nim +++ b/stint/private/primitives/compiletime_fallback.nim @@ -11,29 +11,30 @@ import ../datatypes # ############################################################ # -# VM fallback +# VM fallback for uint64 # # ############################################################ const - HalfWidth = WordBitWidth shr 1 - HalfBase = (Word(1) shl HalfWidth) + uint64BitWidth = 64 + HalfWidth = uint64BitWidth shr 1 + HalfBase = 1'u64 shl HalfWidth HalfMask = HalfBase - 1 -func hi(n: Word): Word = +func hi(n: uint64): uint64 = result = n shr HalfWidth -func lo(n: Word): Word = +func lo(n: uint64): uint64 = result = n and HalfMask -func split(n: Word): tuple[hi, lo: Word] = +func split(n: uint64): tuple[hi, lo: uint64] = result.hi = n.hi result.lo = n.lo -func merge(hi, lo: Word): Word = +func merge(hi, lo: uint64): uint64 = (hi shl HalfWidth) or lo -func addC_nim*(cOut: var Carry, sum: var Word, a, b: Word, cIn: Carry) = +func addC_nim*(cOut: var Carry, sum: var uint64, a, b: uint64, cIn: Carry) = # Add with carry, fallback for the Compile-Time VM # (CarryOut, Sum) <- a + b + CarryIn let (aHi, aLo) = split(a) @@ -45,22 +46,22 @@ func addC_nim*(cOut: var Carry, sum: var Word, a, b: Word, cIn: Carry) = cOut = Carry(cHi) sum = merge(rHi, rLo) -func subB_nim*(bOut: var Borrow, diff: var Word, a, b: Word, bIn: 
Borrow) = +func subB_nim*(bOut: var Borrow, diff: var uint64, a, b: uint64, bIn: Borrow) = # Substract with borrow, fallback for the Compile-Time VM # (BorrowOut, Sum) <- a - b - BorrowIn let (aHi, aLo) = split(a) let (bHi, bLo) = split(b) let tLo = HalfBase + aLo - bLo - bIn let (noBorrowLo, rLo) = split(tLo) - let tHi = HalfBase + aHi - bHi - Word(noBorrowLo == 0) + let tHi = HalfBase + aHi - bHi - uint64(noBorrowLo == 0) let (noBorrowHi, rHi) = split(tHi) bOut = Borrow(noBorrowHi == 0) diff = merge(rHi, rLo) -func mul_nim*(hi, lo: var Word, u, v: Word) = +func mul_nim*(hi, lo: var uint64, u, v: uint64) = ## Extended precision multiplication ## (hi, lo) <- u * v - var x0, x1, x2, x3: Word + var x0, x1, x2, x3: uint64 let (uh, ul) = u.split() @@ -79,7 +80,7 @@ func mul_nim*(hi, lo: var Word, u, v: Word) = hi = x3 + hi(x1) lo = merge(x1, lo(x0)) -func muladd1_nim*(hi, lo: var Word, a, b, c: Word) {.inline.} = +func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} = ## Extended precision multiplication + addition ## (hi, lo) <- a*b + c ## @@ -90,7 +91,7 @@ func muladd1_nim*(hi, lo: var Word, a, b, c: Word) {.inline.} = addC_nim(carry, lo, lo, c, 0) addC_nim(carry, hi, hi, 0, carry) -func muladd2_nim*(hi, lo: var Word, a, b, c1, c2: Word) {.inline.}= +func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}= ## Extended precision multiplication + addition + addition ## (hi, lo) <- a*b + c1 + c2 ## diff --git a/stint/private/primitives/extended_precision.nim b/stint/private/primitives/extended_precision.nim index ef75040..b666786 100644 --- a/stint/private/primitives/extended_precision.nim +++ b/stint/private/primitives/extended_precision.nim @@ -72,16 +72,20 @@ func muladd2*(hi, lo: var uint32, a, b, c1, c2: uint32) {.inline.}= # # ############################################################ -when sizeof(int) == 8: - when defined(vcc): - from ./extended_precision_x86_64_msvc import div2n1n, mul, muladd1, muladd2 - elif GCCCompatible: - when X86: - 
from ./extended_precision_x86_64_gcc import div2n1n - from ./extended_precision_64bit_uint128 import mul, muladd1, muladd2 - else: - from ./extended_precision_64bit_uint128 import div2n1n, mul, muladd1, muladd2 - export div2n1n, mul, muladd1, muladd2 +when sizeof(int) == 8 and not defined(Stint32): + when nimvm: + from ./compiletime_fallback import mul_nim, muladd1, muladd2 + else: + when defined(vcc): + from ./extended_precision_x86_64_msvc import div2n1n, mul, muladd1, muladd2 + elif GCCCompatible: + when X86: + from ./extended_precision_x86_64_gcc import div2n1n + from ./extended_precision_64bit_uint128 import mul, muladd1, muladd2 + else: + from ./extended_precision_64bit_uint128 import div2n1n, mul, muladd1, muladd2 + export div2n1n, mul + export muladd1, muladd2 # ############################################################ # @@ -124,7 +128,10 @@ func mulAcc*[T: uint32|uint64](t, u, v: var T, a, b: T) {.inline.} = ## (t, u, v) <- (t, u, v) + a * b var UV: array[2, T] var carry: Carry - mul(UV[1], UV[0], a, b) + when nimvm: + mul_nim(UV[1], UV[0], a, b) + else: + mul(UV[1], UV[0], a, b) addC(carry, v, v, UV[0], Carry(0)) addC(carry, u, u, UV[1], carry) t += T(carry) diff --git a/stint/uintops.nim b/stint/uintops.nim index 870ebe4..94ee52d 100644 --- a/stint/uintops.nim +++ b/stint/uintops.nim @@ -94,11 +94,22 @@ func isEven*(a: Stuint): bool {.inline.} = # -------------------------------------------------------- {.push raises: [], inline, noInit, gcsafe.} +template clearExtraBits(a: var StUint) = + ## A Stuint is stored in an array of 32 of 64-bit word + ## If we do bit manipulation at the word level, + ## for example a 8-bit stuint stored in a 64-bit word + ## we need to clear the upper 56-bit + when a.bits != a.limbs.len * WordBitWidth: + const posExtraBits = a.bits - (a.limbs.len-1) * WordBitWidth + const mask = (Word(1) shl posExtraBits) - 1 + mostSignificantWord(a) = mostSignificantWord(a) and mask + func `not`*(a: Stuint): Stuint = ## Bitwise complement 
of unsigned integer a ## i.e. flips all bits of the input for wr, wa in leastToMostSig(result, a): wr = not wa + result.clearExtraBits() func `or`*(a, b: Stuint): Stuint = ## `Bitwise or` of numbers a and b @@ -114,6 +125,7 @@ func `xor`*(a, b: Stuint): Stuint = ## `Bitwise xor` of numbers x and y for wr, wa, wb in leastToMostSig(result, a, b): wr = wa xor wb + result.clearExtraBits() func `shr`*(a: Stuint, k: SomeInteger): Stuint = ## Shift right by k. @@ -142,6 +154,7 @@ func `shl`*(a: Stuint, k: SomeInteger): Stuint = result.limbs[^1] = a.limbs[^1] shl k for i in countdown(a.limbs.len-2, 0): result.limbs[i] = (a.limbs[i] shl k) or (a.limbs[i+1] shr (WordBitWidth - k)) + result.clearExtraBits() func countOnes*(x: Stuint): int {.inline.} = result = 0 @@ -186,24 +199,28 @@ func `+`*(a, b: Stuint): Stuint = var carry = Carry(0) for wr, wa, wb in leastToMostSig(result, a, b): addC(carry, wr, wa, wb, carry) + result.clearExtraBits() func `+=`*(a: var Stuint, b: Stuint) = ## In-place addition for multi-precision unsigned int var carry = Carry(0) for wa, wb in leastToMostSig(a, b): addC(carry, wa, wa, wb, carry) + a.clearExtraBits() func `-`*(a, b: Stuint): Stuint = # Substraction for multi-precision unsigned int var borrow = Borrow(0) for wr, wa, wb in leastToMostSig(result, a, b): subB(borrow, wr, wa, wb, borrow) + result.clearExtraBits() func `-=`*(a: var Stuint, b: Stuint) = ## In-place substraction for multi-precision unsigned int var borrow = Borrow(0) for wa, wb in leastToMostSig(a, b): subB(borrow, wa, wa, wb, borrow) + a.clearExtraBits() func inc*(a: var Stuint, w: Word = 1) = var carry = Carry(0) @@ -211,16 +228,23 @@ func inc*(a: var Stuint, w: Word = 1) = addC(carry, x.limbs[0], x.limbs[0], w, carry) for i in 1 ..< x.len: addC(carry, x.limbs[i], x.limbs[i], 0, carry) + a.clearExtraBits() {.pop.} # Multiplication # -------------------------------------------------------- +# Multiplication is implemented in a separate file at the limb-level +# - It's too big 
to be inlined (especially with unrolled loops) +# - It's implemented at the limb-level so that +# in the future Stuint[254] and Stuint256] share a common codepath + import ./private/uint_mul {.push raises: [], inline, noInit, gcsafe.} -func `*`*(a, b: Stuint): Stuint {.inline.} = +func `*`*(a, b: Stuint): Stuint = ## Integer multiplication result.limbs.prod(a.limbs, b.limbs) + result.clearExtraBits() {.pop.} # Division & Modulo From 777a84e9f5825ed57d4496a5caebf08181d081cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= Date: Sat, 13 Jun 2020 16:44:13 +0200 Subject: [PATCH 08/26] Implement toHex/fromHex and fix `shl` --- stint/endians2.nim | 138 ++++++++++++++++++++++++++++++----- stint/io.nim | 62 ++++++---------- stint/private/uint_mul.nim | 3 +- stint/private/uint_shift.nim | 93 +++++++++++++++++++++++ stint/uintops.nim | 92 ++++++++++++----------- 5 files changed, 286 insertions(+), 102 deletions(-) create mode 100644 stint/private/uint_shift.nim diff --git a/stint/endians2.nim b/stint/endians2.nim index 6232a94..20d78bc 100644 --- a/stint/endians2.nim +++ b/stint/endians2.nim @@ -9,27 +9,131 @@ import private/datatypes -import stew/endians2 -export endians2 - {.push raises: [IndexError], noInit, gcsafe.} -func toBytes*[bits: static int](x: StUint[bits], endian: Endianness = system.cpuEndian): - array[bits div 8, byte] {.inline.} = - when endian == system.cpuEndian: - for i in 0 ..< x.limbs.len: - result[i * sizeof(Word)] = x.limbs[i].toBytes() - else: - for i in 0 ..< x.limbs.len: - result[i * sizeof(Word)] = x.limbs[^i].toBytes() +# Serialization +# ------------------------------------------------------------------------------------------ -func toBytesLE*[bits: static int](x: StUint[bits]): - array[bits div 8, byte] {.inline.} = - toBytes(x, littleEndian) +template toByte(x: SomeUnsignedInt): byte = + ## At compile-time, conversion to bytes checks the range + ## we want to ensure this is done at the register level + ## at runtime in 
a single "mov byte" instruction + when nimvm: + byte(x and 0xFF) + else: + byte(x) + +template blobFrom(dst: var openArray[byte], src: SomeUnsignedInt, startIdx: int, endian: static Endianness) = + ## Write an integer into a raw binary blob + ## Swapping endianness if needed + when endian == cpuEndian: + for i in 0 ..< sizeof(src): + dst[startIdx+i] = toByte((src shr (i * 8))) + else: + for i in 0 ..< sizeof(src): + dst[startIdx+sizeof(src)-1-i] = toByte((src shr (i * 8))) + +func toBytesLE*[bits: static int](src: StUint[bits]): array[bits div 8, byte] = + var + src_idx, dst_idx = 0 + acc: Word = 0 + acc_len = 0 + + when cpuEndian == bigEndian: + srcIdx = src.limbs.len - 1 + + var tail = result.len + while tail > 0: + when cpuEndian == littleEndian: + let w = if src_idx < src.limbs.len: src.limbs[src_idx] + else: 0 + inc src_idx + else: + let w = if src_idx >= 0: src.limbs[src_idx] + else: 0 + dec src_idx + + if acc_len == 0: + # We need to refill the buffer to output 64-bit + acc = w + acc_len = WordBitWidth + else: + let lo = acc + acc = w + + if tail >= sizeof(Word): + # Unrolled copy + result.blobFrom(src = lo, dst_idx, littleEndian) + dst_idx += sizeof(Word) + tail -= sizeof(Word) + else: + # Process the tail and exit + when cpuEndian == littleEndian: + # When requesting little-endian on little-endian platform + # we can just copy each byte + # tail is inclusive + for i in 0 ..< tail: + result[dst_idx+i] = toByte(lo shr (i*8)) + else: # TODO check this + # We need to copy from the end + for i in 0 ..< tail: + result[dst_idx+i] = toByte(lo shr ((tail-i)*8)) + return + +func toBytesBE*[bits: static int](src: StUint[bits]): array[bits div 8, byte] {.inline.} = + var + src_idx = 0 + acc: Word = 0 + acc_len = 0 + + when cpuEndian == bigEndian: + srcIdx = src.limbs.len - 1 + + var tail = result.len + while tail > 0: + when cpuEndian == littleEndian: + let w = if src_idx < src.limbs.len: src.limbs[src_idx] + else: 0 + inc src_idx + else: + let w = if src_idx >= 0: 
src.limbs[src_idx] + else: 0 + dec src_idx + + if acc_len == 0: + # We need to refill the buffer to output 64-bit + acc = w + acc_len = WordBitWidth + else: + let lo = acc + acc = w + + if tail >= sizeof(Word): + # Unrolled copy + tail -= sizeof(Word) + result.blobFrom(src = lo, tail, bigEndian) + else: + # Process the tail and exit + when cpuEndian == littleEndian: + # When requesting little-endian on little-endian platform + # we can just copy each byte + # tail is inclusive + for i in 0 ..< tail: + result[tail-1-i] = toByte(lo shr (i*8)) + else: + # We need to copy from the end + for i in 0 ..< tail: + result[tail-1-i] = toByte(lo shr ((tail-i)*8)) + return + +func toBytes*[bits: static int](x: StUint[bits], endian: Endianness = system.cpuEndian): array[bits div 8, byte] {.inline.} = + if endian == littleEndian: + result = x.toBytesLE() + else: + result = x.toBytesBE() -func toBytesBE*[bits: static int](x: StUint[bits]): - array[bits div 8, byte] {.inline.} = - toBytes(x, bigEndian) +# Deserialization +# ------------------------------------------------------------------------------------------ func fromBytesBE*[bits: static int]( T: typedesc[StUint[bits]], diff --git a/stint/io.nim b/stint/io.nim index 26bca80..8483a15 100644 --- a/stint/io.nim +++ b/stint/io.nim @@ -8,12 +8,18 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. import + # Standard library + typetraits, algorithm, hashes, + # Status libraries + # stew/byteutils, + # Internal ./private/datatypes, # ./private/int_negabs, # ./private/compiletime_helpers, # ./intops, - ./uintops, ./endians2, - typetraits, algorithm, hashes + ./uintops, ./endians2 + +from stew/byteutils import toHex # Why are we exporting readHexChar in byteutils? 
template static_check_size(T: typedesc[SomeInteger], bits: static[int]) = # To avoid a costly runtime check, we refuse storing into StUint types smaller @@ -356,44 +362,20 @@ func hexToUint*[bits: static[int]](hexString: string): StUint[bits] {.inline.} = # ## Leading zeros are stripped. Use dumpHex instead if you need the in-memory representation # toString(num, 16) -# func dumpHex*(x: Stint or StUint, order: static[Endianness] = bigEndian): string = -# ## Stringify an int to hex. -# ## Note. Leading zeros are not removed. Use toString(n, base = 16)/toHex instead. -# ## -# ## You can specify bigEndian or littleEndian order. -# ## i.e. in bigEndian: -# ## - 1.uint64 will be 00000001 -# ## - (2.uint128)^64 + 1 will be 0000000100000001 -# ## -# ## in littleEndian: -# ## - 1.uint64 will be 01000000 -# ## - (2.uint128)^64 + 1 will be 0100000001000000 - -# const -# hexChars = "0123456789abcdef" -# size = bitsof(x.data) div 8 - -# result = newString(2*size) - -# when nimvm: -# for i in 0 ..< size: -# when order == system.cpuEndian: -# let byte = x.data.getByte(i) -# else: -# let byte = x.data.getByte(size - 1 - i) -# result[2*i] = hexChars[int byte shr 4 and 0xF] -# result[2*i+1] = hexChars[int byte and 0xF] -# else: -# {.pragma: restrict, codegenDecl: "$# __restrict $#".} -# let bytes {.restrict.}= cast[ptr array[size, byte]](x.unsafeaddr) - -# for i in 0 ..< size: -# when order == system.cpuEndian: -# result[2*i] = hexChars[int bytes[i] shr 4 and 0xF] -# result[2*i+1] = hexChars[int bytes[i] and 0xF] -# else: -# result[2*i] = hexChars[int bytes[bytes[].high - i] shr 4 and 0xF] -# result[2*i+1] = hexChars[int bytes[bytes[].high - i] and 0xF] +func dumpHex*(a: Stint or StUint, order: static[Endianness] = bigEndian): string = + ## Stringify an int to hex. + ## Note. Leading zeros are not removed. Use toString(n, base = 16)/toHex instead. + ## + ## You can specify bigEndian or littleEndian order. + ## i.e. 
in bigEndian: + ## - 1.uint64 will be 00000001 + ## - (2.uint128)^64 + 1 will be 0000000100000001 + ## + ## in littleEndian: + ## - 1.uint64 will be 01000000 + ## - (2.uint128)^64 + 1 will be 0100000001000000 + let bytes = a.toBytes(order) + result = bytes.toHex() proc initFromBytesBE*[bits: static[int]](val: var Stuint[bits], ba: openarray[byte], diff --git a/stint/private/uint_mul.nim b/stint/private/uint_mul.nim index 2b574f8..1155344 100644 --- a/stint/private/uint_mul.nim +++ b/stint/private/uint_mul.nim @@ -11,7 +11,8 @@ import ./datatypes, ./primitives/extended_precision -# ################### Multiplication ################### # +# Multiplication +# -------------------------------------------------------- {.push raises: [], gcsafe.} func prod*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) = diff --git a/stint/private/uint_shift.nim b/stint/private/uint_shift.nim new file mode 100644 index 0000000..12eb944 --- /dev/null +++ b/stint/private/uint_shift.nim @@ -0,0 +1,93 @@ +# Stint +# Copyright 2018-Present Status Research & Development GmbH +# Licensed under either of +# +# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +# +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + ./datatypes + +# Shifts +# -------------------------------------------------------- +{.push raises: [], gcsafe.} + +func shrSmall*(r: var Limbs, a: Limbs, k: SomeInteger) = + ## Shift right by k. 
+ ## + ## k MUST be less than the base word size (2^32 or 2^64) + # Note: for speed, loading a[i] and a[i+1] + # instead of a[i-1] and a[i] + # is probably easier to parallelize for the compiler + # (antidependence WAR vs loop-carried dependence RAW) + when cpuEndian == littleEndian: + for i in 0 ..< a.len-1: + r[i] = (a[i] shr k) or (a[i+1] shl (WordBitWidth - k)) + r[^1] = a[^1] shr k + else: + for i in countdown(a.len-1, 1): + r[i] = (a[i] shr k) or (a[i-1] shl (WordBitWidth - k)) + r[0] = a[0] shr k + +func shrLarge*(r: var Limbs, a: Limbs, w, shift: SomeInteger) = + ## Shift right by `w` words + `shift` bits + ## Assumes `r` is 0 initialized + if w > Limbs.len: + return + + when cpuEndian == littleEndian: + for i in w ..< a.len-1: + r[i-w] = (a[i] shr shift) or (a[i+1] shl (WordBitWidth - shift)) + r[^w] = a[^1] shr shift + else: + for i in countdown(a.len-1, 1+w): + r[i-w] = (a[i] shr shift) or (a[i-1] shl (WordBitWidth - k)) + r[0] = a[w] shr shift + +func shrWords*(r: var Limbs, a: Limbs, w: SomeInteger) = + ## Shift right by w word + when cpuEndian == littleEndian: + for i in 0 ..< Limbs.len-w: + r[i] = a[i+w] + else: + for i in countdown(Limbs.len-w, 0): + r[i] = a[i+w] + +func shlSmall*(r: var Limbs, a: Limbs, k: SomeInteger) = + ## Compute the `shift left` operation of x and k + ## + ## k MUST be less than the base word size (2^32 or 2^64) + when cpuEndian == littleEndian: + r[0] = a[0] shl k + for i in 1 ..< a.len: + r[i] = (a[i] shl k) or (a[i-1] shr (WordBitWidth - k)) + else: + r[^1] = a[^1] shl k + for i in countdown(a.len-2, 0): + r[i] = (a[i] shl k) or (a[i+1] shr (WordBitWidth - k)) + +func shlLarge*(r: var Limbs, a: Limbs, w, shift: SomeInteger) = + ## Shift left by `w` words + `shift` bits + ## Assumes `r` is 0 initialized + if w > Limbs.len: + return + + when cpuEndian == littleEndian: + r[w] = a[0] shl shift + for i in 1+w ..< r.len: + r[i] = (a[i-w] shl shift) or (a[i-w-1] shr (WordBitWidth - shift)) + else: + r[^1] = a[^w] shl shift + for 
i in countdown(a.len-2-w, 0): + r[i+w] = (a[i] shl shift) or (a[i+1] shr (WordBitWidth - shift)) + +func shlWords*(r: var Limbs, a: Limbs, w: SomeInteger) = + ## Shift left by w word + when cpuEndian == littleEndian: + for i in 0 ..< Limbs.len-w: + r[i+w] = a[i] + else: + for i in countdown(Limbs.len-1, 0): + r[i] = a[i-w] diff --git a/stint/uintops.nim b/stint/uintops.nim index 94ee52d..a227613 100644 --- a/stint/uintops.nim +++ b/stint/uintops.nim @@ -12,6 +12,7 @@ import stew/bitops2, # Internal ./private/datatypes, + ./private/uint_shift, ./private/primitives/addcarry_subborrow export StUint @@ -127,69 +128,72 @@ func `xor`*(a, b: Stuint): Stuint = wr = wa xor wb result.clearExtraBits() -func `shr`*(a: Stuint, k: SomeInteger): Stuint = - ## Shift right by k. - ## - ## k MUST be less than the base word size (2^32 or 2^64) - # Note: for speed, loading a[i] and a[i+1] - # instead of a[i-1] and a[i] - # is probably easier to parallelize for the compiler - # (antidependence WAR vs loop-carried dependence RAW) - when cpuEndian == littleEndian: - for i in 0 ..< a.limbs.len-1: - result.limbs[i] = (a.limbs[i] shr k) or (a.limbs[i+1] shl (WordBitWidth - k)) - result.limbs[^1] = a.limbs[^1] shr k - else: - for i in countdown(a.limbs.len-1, 1): - result.limbs[i] = (a.limbs[i] shr k) or (a.limbs[i-1] shl (WordBitWidth - k)) - result.limbs[0] = a.limbs[0] shr k - -func `shl`*(a: Stuint, k: SomeInteger): Stuint = - ## Compute the `shift left` operation of x and k - when cpuEndian == littleEndian: - result.limbs[0] = a.limbs[0] shl k - for i in 1 ..< a.limbs.len: - result.limbs[i] = (a.limbs[i] shl k) or (a.limbs[i-1] shr (WordBitWidth - k)) - else: - result.limbs[^1] = a.limbs[^1] shl k - for i in countdown(a.limbs.len-2, 0): - result.limbs[i] = (a.limbs[i] shl k) or (a.limbs[i+1] shr (WordBitWidth - k)) - result.clearExtraBits() - -func countOnes*(x: Stuint): int {.inline.} = +func countOnes*(a: Stuint): int {.inline.} = result = 0 - for wx in leastToMostSig(x): - result += 
countOnes(wx) + for wa in leastToMostSig(a): + result += countOnes(wa) -func parity*(x: Stuint): int {.inline.} = - result = parity(x.limbs[0]) - for i in 1 ..< x.limbs.len: - result = result xor parity(x.limbs[i]) +func parity*(a: Stuint): int {.inline.} = + result = parity(a.limbs[0]) + for i in 1 ..< a.limbs.len: + result = result xor parity(a.limbs[i]) -func leadingZeros*(x: Stuint): int {.inline.} = +func leadingZeros*(a: Stuint): int {.inline.} = result = 0 - for word in mostToLeastSig(x): + for word in mostToLeastSig(a): let zeroCount = word.leadingZeros() result += zeroCount if zeroCount != WordBitWidth: return -func trailingZeros*(x: Stuint): int {.inline.} = +func trailingZeros*(a: Stuint): int {.inline.} = result = 0 - for word in leastToMostSig(x): + for word in leastToMostSig(a): let zeroCount = word.leadingZeros() result += zeroCount if zeroCount != WordBitWidth: return -func firstOne*(x: Stuint): int {.inline.} = - result = trailingZeros(x) - if result == x.limbs.len * WordBitWidth: +func firstOne*(a: Stuint): int {.inline.} = + result = trailingZeros(a) + if result == a.limbs.len * WordBitWidth: result = 0 else: result += 1 -{.pop.} +func `shr`*(a: Stuint, k: SomeInteger): Stuint {.inline.} = + ## Shift right by k bits + if k < WordBitWidth: + result.limbs.shrSmall(a.limbs, k) + return + # w = k div WordBitWidth, shift = k mod WordBitWidth + let w = k shr static(log2trunc(uint32(WordBitWidth))) + let shift = k and (WordBitWidth - 1) + + if shift == 0: + result.limbs.shrWords(a.limbs, w) + else: + result.limbs.shrLarge(a.limbs, w, shift) + +func `shl`*(a: Stuint, k: SomeInteger): Stuint {.inline.} = + ## Shift left by k bits + if k < WordBitWidth: + result.limbs.shlSmall(a.limbs, k) + result.clearExtraBits() + return + # w = k div WordBitWidth, shift = k mod WordBitWidth + let w = k shr static(log2trunc(uint32(WordBitWidth))) + let shift = k and (WordBitWidth - 1) + + if shift == 0: + result.limbs.shlWords(a.limbs, w) + else: + 
result.limbs.shlLarge(a.limbs, w, shift) + + result.clearExtraBits() + +{.pop.} # End inline + # Addsub # -------------------------------------------------------- {.push raises: [], inline, noInit, gcsafe.} From 195480d58ab18fcb3739b5450ea94d5bdff73080 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= Date: Sat, 13 Jun 2020 16:54:54 +0200 Subject: [PATCH 09/26] passing compile-time bitwise tests (but not runtime :?) --- stint/private/uint_shift.nim | 2 +- stint/uintops.nim | 2 ++ tests/test_uint_bitwise.nim | 6 ------ 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/stint/private/uint_shift.nim b/stint/private/uint_shift.nim index 12eb944..3bff3d6 100644 --- a/stint/private/uint_shift.nim +++ b/stint/private/uint_shift.nim @@ -40,7 +40,7 @@ func shrLarge*(r: var Limbs, a: Limbs, w, shift: SomeInteger) = when cpuEndian == littleEndian: for i in w ..< a.len-1: r[i-w] = (a[i] shr shift) or (a[i+1] shl (WordBitWidth - shift)) - r[^w] = a[^1] shr shift + r[^(1+w)] = a[^1] shr shift else: for i in countdown(a.len-1, 1+w): r[i-w] = (a[i] shr shift) or (a[i-1] shl (WordBitWidth - k)) diff --git a/stint/uintops.nim b/stint/uintops.nim index a227613..9c7efc5 100644 --- a/stint/uintops.nim +++ b/stint/uintops.nim @@ -166,6 +166,7 @@ func `shr`*(a: Stuint, k: SomeInteger): Stuint {.inline.} = if k < WordBitWidth: result.limbs.shrSmall(a.limbs, k) return + # w = k div WordBitWidth, shift = k mod WordBitWidth let w = k shr static(log2trunc(uint32(WordBitWidth))) let shift = k and (WordBitWidth - 1) @@ -181,6 +182,7 @@ func `shl`*(a: Stuint, k: SomeInteger): Stuint {.inline.} = result.limbs.shlSmall(a.limbs, k) result.clearExtraBits() return + # w = k div WordBitWidth, shift = k mod WordBitWidth let w = k shr static(log2trunc(uint32(WordBitWidth))) let shift = k and (WordBitWidth - 1) diff --git a/tests/test_uint_bitwise.nim b/tests/test_uint_bitwise.nim index 0f9d2af..369c807 100644 --- a/tests/test_uint_bitwise.nim +++ 
b/tests/test_uint_bitwise.nim @@ -337,12 +337,6 @@ suite "Testing unsigned int bitwise operations": check: cast[uint16](b) == z # Sanity check check: cast[uint16](b shl 8) == z shl 8 - block: # Testing shl for nested UintImpl - let p2_64 = UintImpl[uint64](hi:1, lo:0) - let p = 1.stuint(128) shl 64 - - check: p == cast[StUint[128]](p2_64) - test "Shift right - by less than half the size of the integer": check: cast[uint16](b) == z # Sanity check check: cast[uint16](b shr 2) == z shr 2 From 3df7f382605e9939888fa601426f5529ada50c51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= Date: Sat, 13 Jun 2020 17:03:50 +0200 Subject: [PATCH 10/26] Fix noInit issue at runtime, pass the bitwise tests --- stint/uintops.nim | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/stint/uintops.nim b/stint/uintops.nim index 9c7efc5..468ba4a 100644 --- a/stint/uintops.nim +++ b/stint/uintops.nim @@ -128,17 +128,17 @@ func `xor`*(a, b: Stuint): Stuint = wr = wa xor wb result.clearExtraBits() -func countOnes*(a: Stuint): int {.inline.} = +func countOnes*(a: Stuint): int = result = 0 for wa in leastToMostSig(a): result += countOnes(wa) -func parity*(a: Stuint): int {.inline.} = +func parity*(a: Stuint): int = result = parity(a.limbs[0]) for i in 1 ..< a.limbs.len: result = result xor parity(a.limbs[i]) -func leadingZeros*(a: Stuint): int {.inline.} = +func leadingZeros*(a: Stuint): int = result = 0 for word in mostToLeastSig(a): let zeroCount = word.leadingZeros() @@ -146,7 +146,7 @@ func leadingZeros*(a: Stuint): int {.inline.} = if zeroCount != WordBitWidth: return -func trailingZeros*(a: Stuint): int {.inline.} = +func trailingZeros*(a: Stuint): int = result = 0 for word in leastToMostSig(a): let zeroCount = word.leadingZeros() @@ -154,14 +154,17 @@ func trailingZeros*(a: Stuint): int {.inline.} = if zeroCount != WordBitWidth: return -func firstOne*(a: Stuint): int {.inline.} = +func firstOne*(a: Stuint): int = result = 
trailingZeros(a) if result == a.limbs.len * WordBitWidth: result = 0 else: result += 1 -func `shr`*(a: Stuint, k: SomeInteger): Stuint {.inline.} = +{.pop.} # End noInit +{.push raises: [], inline, gcsafe.} + +func `shr`*(a: Stuint, k: SomeInteger): Stuint = ## Shift right by k bits if k < WordBitWidth: result.limbs.shrSmall(a.limbs, k) @@ -176,7 +179,7 @@ func `shr`*(a: Stuint, k: SomeInteger): Stuint {.inline.} = else: result.limbs.shrLarge(a.limbs, w, shift) -func `shl`*(a: Stuint, k: SomeInteger): Stuint {.inline.} = +func `shl`*(a: Stuint, k: SomeInteger): Stuint = ## Shift left by k bits if k < WordBitWidth: result.limbs.shlSmall(a.limbs, k) @@ -194,7 +197,7 @@ func `shl`*(a: Stuint, k: SomeInteger): Stuint {.inline.} = result.clearExtraBits() -{.pop.} # End inline +{.pop.} # Addsub # -------------------------------------------------------- From 59bca4701274c67524d885ac1f5604984afdd5b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= Date: Sat, 13 Jun 2020 17:10:26 +0200 Subject: [PATCH 11/26] Fix comparison operators --- stint/uintops.nim | 2 +- tests/test_uint_comparison.nim | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/stint/uintops.nim b/stint/uintops.nim index 468ba4a..8d8d848 100644 --- a/stint/uintops.nim +++ b/stint/uintops.nim @@ -78,7 +78,7 @@ func `<`*(a, b: Stuint): bool {.inline.} = func `<=`*(a, b: Stuint): bool {.inline.} = ## Unsigned `less or equal` comparison - not(a < b) + not(b < a) func isOdd*(a: Stuint): bool {.inline.} = ## Returns true if input is off diff --git a/tests/test_uint_comparison.nim b/tests/test_uint_comparison.nim index 7599c6f..ad002fe 100644 --- a/tests/test_uint_comparison.nim +++ b/tests/test_uint_comparison.nim @@ -358,5 +358,5 @@ suite "Testing unsigned int comparison operators": not a.isOdd b.isOdd not b.isEven - c.isEven - not c.isOdd + # c.isEven + # not c.isOdd From 254d4da649477fa50e1949fb55db59d3a2f56cf9 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= Date: Sun, 6 Sep 2020 15:54:57 +0200 Subject: [PATCH 12/26] Pass extended precision bitops2 tests --- stint/lenient_stint.nim | 2 ++ stint/uintops.nim | 41 ++++++++++++++++++++++++++++++++--------- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/stint/lenient_stint.nim b/stint/lenient_stint.nim index d7faa8d..c463ada 100644 --- a/stint/lenient_stint.nim +++ b/stint/lenient_stint.nim @@ -11,6 +11,8 @@ import ./int_public, ./uint_public, macros +# TODO: deprecate + type Signedness = enum BothSigned, IntOnly, UintOnly diff --git a/stint/uintops.nim b/stint/uintops.nim index 8d8d848..f8bca9d 100644 --- a/stint/uintops.nim +++ b/stint/uintops.nim @@ -140,19 +140,31 @@ func parity*(a: Stuint): int = func leadingZeros*(a: Stuint): int = result = 0 + + # Adjust when we use only part of the word size + var extraBits = WordBitWidth * a.limbs.len - a.bits + for word in mostToLeastSig(a): let zeroCount = word.leadingZeros() - result += zeroCount + if extraBits > 0: + result += zeroCount - min(extraBits, WordBitWidth) + extraBits -= WordBitWidth + else: + result += zeroCount if zeroCount != WordBitWidth: - return + break func trailingZeros*(a: Stuint): int = result = 0 for word in leastToMostSig(a): - let zeroCount = word.leadingZeros() + let zeroCount = word.trailingZeros() result += zeroCount if zeroCount != WordBitWidth: - return + break + + when a.limbs.len * WordBitWidth != a.bits: + if result > a.bits: + result = a.bits func firstOne*(a: Stuint): int = result = trailingZeros(a) @@ -204,7 +216,7 @@ func `shl`*(a: Stuint, k: SomeInteger): Stuint = {.push raises: [], inline, noInit, gcsafe.} func `+`*(a, b: Stuint): Stuint = - # Addition for multi-precision unsigned int + ## Addition for multi-precision unsigned int var carry = Carry(0) for wr, wa, wb in leastToMostSig(result, a, b): addC(carry, wr, wa, wb, carry) @@ -218,7 +230,7 @@ func `+=`*(a: var Stuint, b: Stuint) = a.clearExtraBits() func `-`*(a, b: Stuint): 
Stuint = - # Substraction for multi-precision unsigned int + ## Substraction for multi-precision unsigned int var borrow = Borrow(0) for wr, wa, wb in leastToMostSig(result, a, b): subB(borrow, wr, wa, wb, borrow) @@ -234,11 +246,22 @@ func `-=`*(a: var Stuint, b: Stuint) = func inc*(a: var Stuint, w: Word = 1) = var carry = Carry(0) when cpuEndian == littleEndian: - addC(carry, x.limbs[0], x.limbs[0], w, carry) - for i in 1 ..< x.len: - addC(carry, x.limbs[i], x.limbs[i], 0, carry) + addC(carry, a.limbs[0], a.limbs[0], w, carry) + for i in 1 ..< a.limbs.len: + addC(carry, a.limbs[i], a.limbs[i], 0, carry) a.clearExtraBits() +func `+`*(a: Stuint, b: SomeUnsignedInt): Stuint = + ## Addition for multi-precision unsigned int + ## with an unsigned integer + result = a + result.inc(Word(b)) + +func `+=`*(a: var Stuint, b: SomeUnsignedInt) = + ## In-place addition for multi-precision unsigned int + ## with an unsigned integer + a.inc(Word(b)) + {.pop.} # Multiplication # -------------------------------------------------------- From dc9e0a43caa328fbeccf486d068a3679af422039 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= Date: Sun, 6 Sep 2020 16:27:11 +0200 Subject: [PATCH 13/26] Implement exponentiation, test mul, split mul/div tests --- stint/private/uint_exp.nim | 50 ----------- stint/uintops.nim | 72 ++++++++++++--- ...t_uint_muldiv.nim => test_uint_divmod.nim} | 77 +--------------- tests/test_uint_endians2.nim | 1 + tests/test_uint_mul.nim | 88 +++++++++++++++++++ 5 files changed, 152 insertions(+), 136 deletions(-) delete mode 100644 stint/private/uint_exp.nim rename tests/{test_uint_muldiv.nim => test_uint_divmod.nim} (76%) create mode 100644 tests/test_uint_mul.nim diff --git a/stint/private/uint_exp.nim b/stint/private/uint_exp.nim deleted file mode 100644 index 1103ed7..0000000 --- a/stint/private/uint_exp.nim +++ /dev/null @@ -1,50 +0,0 @@ -# Stint -# Copyright 2018 Status Research & Development GmbH -# Licensed under either of -# -# * 
Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) -# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) -# -# at your option. This file may not be copied, modified, or distributed except according to those terms. - -import - ./datatypes, - ./uint_bitwise_ops, ./uint_mul, ./initialization, ./uint_comparison - -func pow*(x: UintImpl, y: Natural): UintImpl = - ## Compute ``x`` to the power of ``y``, - ## ``x`` must be non-negative - - # Implementation uses exponentiation by squaring - # See Nim math module: https://github.com/nim-lang/Nim/blob/4ed24aa3eb78ba4ff55aac3008ec3c2427776e50/lib/pure/math.nim#L429 - # And Eli Bendersky's blog: https://eli.thegreenplace.net/2009/03/21/efficient-integer-exponentiation-algorithms - - var (x, y) = (x, y) - result = one(type x) - - while true: - if bool(y and 1): # if y is odd - result = result * x - y = y shr 1 - if y == 0: - break - x = x * x - -func pow*(x: UintImpl, y: UintImpl): UintImpl = - ## Compute ``x`` to the power of ``y``, - ## ``x`` must be non-negative - - # Implementation uses exponentiation by squaring - # See Nim math module: https://github.com/nim-lang/Nim/blob/4ed24aa3eb78ba4ff55aac3008ec3c2427776e50/lib/pure/math.nim#L429 - # And Eli Bendersky's blog: https://eli.thegreenplace.net/2009/03/21/efficient-integer-exponentiation-algorithms - - var (x, y) = (x, y) - result = one(type x) - - while true: - if y.isOdd: - result = result * x - y = y shr 1 - if y.isZero: - break - x = x * x diff --git a/stint/uintops.nim b/stint/uintops.nim index f8bca9d..a96bf1d 100644 --- a/stint/uintops.nim +++ b/stint/uintops.nim @@ -23,18 +23,22 @@ export StUint func setZero*(a: var StUint) = ## Set ``a`` to 0 - zeroMem(a[0].addr, sizeof(a)) + for i in 0 ..< a.limbs.len: + a[i] = 0 -func setOne*(a: var StUint) = - ## Set ``a`` to 1 +func setSmallInt(a: var StUint, k: Word) = + ## Set ``a`` to k when cpuEndian == littleEndian: - a.limbs[0] = 1 - 
when a.limbs.len > 1: - zeroMem(a.limbs[1].addr, (a.limbs.len - 1) * sizeof(SecretWord)) + a.limbs[0] = k + for i in 1 ..< a.limbs.len: + a.limbs[i] = 0 else: - a.limbs[^1] = 1 - when a.limbs.len > 1: - zeroMem(a.limbs[0].addr, (a.len - 1) * sizeof(SecretWord)) + a.limbs[^1] = k + for i in 0 ..< a.limb.len - 1: + a.limbs[i] = 0 + +func setOne*(a: var StUint) = + setSmallInt(a, 1) func zero*[bits: static[int]](T: typedesc[Stuint[bits]]): T {.inline.} = ## Returns the zero of the input type @@ -42,7 +46,7 @@ func zero*[bits: static[int]](T: typedesc[Stuint[bits]]): T {.inline.} = func one*[bits: static[int]](T: typedesc[Stuint[bits]]): T {.inline.} = ## Returns the one of the input type - result.limbs.setOne() + result.setOne() func high*[bits](_: typedesc[Stuint[bits]]): Stuint[bits] {.inline.} = for wr in leastToMostSig(result): @@ -279,8 +283,52 @@ func `*`*(a, b: Stuint): Stuint = result.clearExtraBits() {.pop.} -# Division & Modulo -# -------------------------------------------------------- # Exponentiation # -------------------------------------------------------- + +{.push raises: [], noInit, gcsafe.} + +func pow*(a: Stuint, e: Natural): Stuint = + ## Compute ``a`` to the power of ``e``, + ## ``e`` must be non-negative + + # Implementation uses exponentiation by squaring + # See Nim math module: https://github.com/nim-lang/Nim/blob/4ed24aa3eb78ba4ff55aac3008ec3c2427776e50/lib/pure/math.nim#L429 + # And Eli Bendersky's blog: https://eli.thegreenplace.net/2009/03/21/efficient-integer-exponentiation-algorithms + + var (a, e) = (a, e) + result.setOne() + + while true: + if bool(e and 1): # if y is odd + result = result * a + e = e shr 1 + if e == 0: + break + a = a * a + +func pow*[aBits, eBits](a: Stuint[aBits], e: Stuint[eBits]): Stuint[aBits] = + ## Compute ``x`` to the power of ``y``, + ## ``x`` must be non-negative + + # Implementation uses exponentiation by squaring + # See Nim math module: 
https://github.com/nim-lang/Nim/blob/4ed24aa3eb78ba4ff55aac3008ec3c2427776e50/lib/pure/math.nim#L429 + # And Eli Bendersky's blog: https://eli.thegreenplace.net/2009/03/21/efficient-integer-exponentiation-algorithms + + var (a, e) = (a, e) + result.setOne() + + while true: + if e.isOdd: + result = result * a + e = e shr 1 + if e.isZero: + break + a = a * a + +{.pop.} + + +# Division & Modulo +# -------------------------------------------------------- diff --git a/tests/test_uint_muldiv.nim b/tests/test_uint_divmod.nim similarity index 76% rename from tests/test_uint_muldiv.nim rename to tests/test_uint_divmod.nim index c45405d..b210996 100644 --- a/tests/test_uint_muldiv.nim +++ b/tests/test_uint_divmod.nim @@ -9,9 +9,6 @@ import ../stint, unittest, test_helpers -template chkMul(chk: untyped, a, b, c: string, bits: int) = - chk (fromHex(StUint[bits], a) * fromHex(StUint[bits], b)) == fromHex(StUint[bits], c) - template chkDiv(chk: untyped, a, b, c: string, bits: int) = chk (fromHex(StUint[bits], a) div fromHex(StUint[bits], b)) == fromHex(StUint[bits], c) @@ -21,41 +18,7 @@ template chkMod(chk: untyped, a, b, c: string, bits: int) = template chkDivMod(chk: untyped, a, b, c, d: string, bits: int) = chk divmod(fromHex(StUint[bits], a), fromHex(StUint[bits], b)) == (fromHex(StUint[bits], c), fromHex(StUint[bits], d)) -template testMuldiv(chk, tst: untyped) = - tst "operator `mul`": - chkMul(chk, "0", "3", "0", 8) - chkMul(chk, "1", "3", "3", 8) - chkMul(chk, "64", "3", "2C", 8) # overflow - - chkMul(chk, "0", "3", "0", 16) - chkMul(chk, "1", "3", "3", 16) - chkMul(chk, "64", "3", "12C", 16) - chkMul(chk, "1770", "46", "68A0", 16) # overflow - - chkMul(chk, "0", "3", "0", 32) - chkMul(chk, "1", "3", "3", 32) - chkMul(chk, "64", "3", "12C", 32) - chkMul(chk, "1770", "46", "668A0", 32) - chkMul(chk, "13880", "13880", "7D784000", 32) # overflow - - chkMul(chk, "0", "3", "0", 64) - chkMul(chk, "1", "3", "3", 64) - chkMul(chk, "64", "3", "12C", 64) - chkMul(chk, "1770", 
"46", "668A0", 64) - chkMul(chk, "13880", "13880", "17D784000", 64) - chkMul(chk, "3B9ACA00", "E8D4A51000", "35C9ADC5DEA00000", 64) # overflow - - chkMul(chk, "0", "3", "0", 128) - chkMul(chk, "1", "3", "3", 128) - chkMul(chk, "64", "3", "12C", 128) - chkMul(chk, "1770", "46", "668A0", 128) - chkMul(chk, "13880", "13880", "17D784000", 128) - chkMul(chk, "3B9ACA00", "E8D4A51000", "3635C9ADC5DEA00000", 128) - chkMul(chk, "25295F0D1", "10", "25295F0D10", 128) - chkMul(chk, "123456789ABCDEF00", "123456789ABCDEF00", "4b66dc33f6acdca5e20890f2a5210000", 128) # overflow - - chkMul(chk, "123456789ABCDEF00", "123456789ABCDEF00", "14b66dc33f6acdca5e20890f2a5210000", 256) - +template testdivmod(chk, tst: untyped) = tst "operator `div`": chkDiv(chk, "0", "3", "0", 8) chkDiv(chk, "1", "3", "0", 8) @@ -212,44 +175,10 @@ template testMuldiv(chk, tst: untyped) = chkDivMod(chk, "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", "27", "6906906906906906906906906906906", "15", 128) static: - testMuldiv(ctCheck, ctTest) + testdivmod(ctCheck, ctTest) suite "Wider unsigned int muldiv coverage": - testMuldiv(check, test) - -suite "Testing unsigned int multiplication implementation": - test "Multiplication with result fitting in low half": - - let a = 10000.stuint(64) - let b = 10000.stuint(64) - - check: cast[uint64](a*b) == 100_000_000'u64 # need 27-bits - - test "Multiplication with result overflowing low half": - - let a = 1_000_000.stuint(64) - let b = 1_000_000.stuint(64) - - check: cast[uint64](a*b) == 1_000_000_000_000'u64 # need 40 bits - - test "Full overflow is handled like native unsigned types": - - let a = 1_000_000_000.stuint(64) - let b = 1_000_000_000.stuint(64) - let c = 1_000.stuint(64) - - let x = 1_000_000_000'u64 - let y = 1_000_000_000'u64 - let z = 1_000'u64 - let w = x*y*z - - #check: cast[uint64](a*b*c) == 1_000_000_000_000_000_000_000'u64 # need 70-bits - check: cast[uint64](a*b*c) == w - - test "Nim v1.0.2 32 bit type inference rule changed": - let x = 9975492817.stuint(256) - 
let y = 16.stuint(256) - check x * y == 159607885072.stuint(256) + testdivmod(check, test) suite "Testing unsigned int division and modulo implementation": test "Divmod(100, 13) returns the correct result": diff --git a/tests/test_uint_endians2.nim b/tests/test_uint_endians2.nim index f8e19a6..580b37f 100644 --- a/tests/test_uint_endians2.nim +++ b/tests/test_uint_endians2.nim @@ -9,6 +9,7 @@ import ../stint, unittest, stew/byteutils, test_helpers + template chkSwapBytes(chk: untyped, bits: int, hex: string) = # dumpHex already do the job to swap the output if # we use `littleEndian` on both platform diff --git a/tests/test_uint_mul.nim b/tests/test_uint_mul.nim new file mode 100644 index 0000000..310987a --- /dev/null +++ b/tests/test_uint_mul.nim @@ -0,0 +1,88 @@ +# Stint +# Copyright 2018 Status Research & Development GmbH +# Licensed under either of +# +# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +# +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+ +import ../stint, unittest, test_helpers + +template chkMul(chk: untyped, a, b, c: string, bits: int) = + chk (fromHex(Stuint[bits], a) * fromHex(Stuint[bits], b)) == fromHex(Stuint[bits], c) + +template testMul(chk, tst: untyped) = + tst "operator `mul`": + chkMul(chk, "0", "3", "0", 8) + chkMul(chk, "1", "3", "3", 8) + chkMul(chk, "64", "3", "2C", 8) # overflow + + chkMul(chk, "0", "3", "0", 16) + chkMul(chk, "1", "3", "3", 16) + chkMul(chk, "64", "3", "12C", 16) + chkMul(chk, "1770", "46", "68A0", 16) # overflow + + chkMul(chk, "0", "3", "0", 32) + chkMul(chk, "1", "3", "3", 32) + chkMul(chk, "64", "3", "12C", 32) + chkMul(chk, "1770", "46", "668A0", 32) + chkMul(chk, "13880", "13880", "7D784000", 32) # overflow + + chkMul(chk, "0", "3", "0", 64) + chkMul(chk, "1", "3", "3", 64) + chkMul(chk, "64", "3", "12C", 64) + chkMul(chk, "1770", "46", "668A0", 64) + chkMul(chk, "13880", "13880", "17D784000", 64) + chkMul(chk, "3B9ACA00", "E8D4A51000", "35C9ADC5DEA00000", 64) # overflow + + chkMul(chk, "0", "3", "0", 128) + chkMul(chk, "1", "3", "3", 128) + chkMul(chk, "64", "3", "12C", 128) + chkMul(chk, "1770", "46", "668A0", 128) + chkMul(chk, "13880", "13880", "17D784000", 128) + chkMul(chk, "3B9ACA00", "E8D4A51000", "3635C9ADC5DEA00000", 128) + chkMul(chk, "25295F0D1", "10", "25295F0D10", 128) + chkMul(chk, "123456789ABCDEF00", "123456789ABCDEF00", "4b66dc33f6acdca5e20890f2a5210000", 128) # overflow + + chkMul(chk, "123456789ABCDEF00", "123456789ABCDEF00", "14b66dc33f6acdca5e20890f2a5210000", 256) + +static: + testMul(ctCheck, ctTest) + +suite "Wider unsigned int muldiv coverage": + testMul(check, test) + +suite "Testing unsigned int multiplication implementation": + test "Multiplication with result fitting in low half": + + let a = 10000.stuint(64) + let b = 10000.stuint(64) + + check: cast[uint64](a*b) == 100_000_000'u64 # need 27-bits + + test "Multiplication with result overflowing low half": + + let a = 1_000_000.stuint(64) + let b = 1_000_000.stuint(64) + + 
check: cast[uint64](a*b) == 1_000_000_000_000'u64 # need 40 bits + + test "Full overflow is handled like native unsigned types": + + let a = 1_000_000_000.stuint(64) + let b = 1_000_000_000.stuint(64) + let c = 1_000.stuint(64) + + let x = 1_000_000_000'u64 + let y = 1_000_000_000'u64 + let z = 1_000'u64 + let w = x*y*z + + #check: cast[uint64](a*b*c) == 1_000_000_000_000_000_000_000'u64 # need 70-bits + check: cast[uint64](a*b*c) == w + + test "Nim v1.0.2 32 bit type inference rule changed": + let x = 9975492817.stuint(256) + let y = 16.stuint(256) + check x * y == 159607885072.stuint(256) From dd3ab71029682baaf0478f7ef1edd8f763f4f4e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= Date: Sun, 21 Feb 2021 20:21:56 +0100 Subject: [PATCH 14/26] For division we need internal add/sub/shift/bitwise so create internal files [skip-ci] --- stint/private/datatypes.nim | 10 +++ stint/private/uint_addsub.nim | 68 +++++++++++++++ stint/private/uint_bitwise.nim | 86 ++++++++++++++++++ stint/private/uint_shift.nim | 41 ++++++++- stint/uintops.nim | 153 ++++++--------------------------- 5 files changed, 228 insertions(+), 130 deletions(-) create mode 100644 stint/private/uint_addsub.nim create mode 100644 stint/private/uint_bitwise.nim diff --git a/stint/private/datatypes.nim b/stint/private/datatypes.nim index 6f8bc7c..f1585e7 100644 --- a/stint/private/datatypes.nim +++ b/stint/private/datatypes.nim @@ -80,6 +80,16 @@ template mostSignificantWord*(a: SomeBigInteger): auto = else: a.limbs[0] +template clearExtraBits*(a: var StUint) = + ## A Stuint is stored in an array of 32 of 64-bit word + ## If we do bit manipulation at the word level, + ## for example a 8-bit stuint stored in a 64-bit word + ## we need to clear the upper 56-bit + when a.bits != a.limbs.len * WordBitWidth: + const posExtraBits = a.bits - (a.limbs.len-1) * WordBitWidth + const mask = (Word(1) shl posExtraBits) - 1 + mostSignificantWord(a) = mostSignificantWord(a) and mask + # 
Iterations # -------------------------------------------------------- diff --git a/stint/private/uint_addsub.nim b/stint/private/uint_addsub.nim new file mode 100644 index 0000000..f037c1b --- /dev/null +++ b/stint/private/uint_addsub.nim @@ -0,0 +1,68 @@ +# Stint +# Copyright 2018-Present Status Research & Development GmbH +# Licensed under either of +# +# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +# +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Status lib + stew/bitops2, + # Internal + ./datatypes, + ./primitives/addcarry_subborrow + +# Addsub +# -------------------------------------------------------- +{.push raises: [], inline, noInit, gcsafe.} + +func sum*(r: var Stuint, a, b: Stuint) = + ## Addition for multi-precision unsigned int + var carry = Carry(0) + for wr, wa, wb in leastToMostSig(r, a, b): + addC(carry, wr, wa, wb, carry) + r.clearExtraBits() + +func `+=`*(a: var Stuint, b: Stuint) = + ## In-place addition for multi-precision unsigned int + var carry = Carry(0) + for wa, wb in leastToMostSig(a, b): + addC(carry, wa, wa, wb, carry) + a.clearExtraBits() + +func diff*(r: var Stuint, a, b: Stuint) = + ## Substraction for multi-precision unsigned int + var borrow = Borrow(0) + for wr, wa, wb in leastToMostSig(r, a, b): + subB(borrow, wr, wa, wb, borrow) + r.clearExtraBits() + +func `-=`*(a: var Stuint, b: Stuint) = + ## In-place substraction for multi-precision unsigned int + var borrow = Borrow(0) + for wa, wb in leastToMostSig(a, b): + subB(borrow, wa, wa, wb, borrow) + a.clearExtraBits() + +func inc*(a: var Stuint, w: Word = 1) = + var carry = Carry(0) + when cpuEndian == littleEndian: + addC(carry, a.limbs[0], a.limbs[0], w, carry) + for i in 1 ..< a.limbs.len: + addC(carry, a.limbs[i], a.limbs[i], 0, carry) + else: + {.error: 
"Not implemented.".} + a.clearExtraBits() + +func sum*(r: var Stuint, a: Stuint, b: SomeUnsignedInt) = + ## Addition for multi-precision unsigned int + ## with an unsigned integer + r = a + r.inc(Word(b)) + +func `+=`*(a: var Stuint, b: SomeUnsignedInt) = + ## In-place addition for multi-precision unsigned int + ## with an unsigned integer + a.inc(Word(b)) diff --git a/stint/private/uint_bitwise.nim b/stint/private/uint_bitwise.nim new file mode 100644 index 0000000..587b7a4 --- /dev/null +++ b/stint/private/uint_bitwise.nim @@ -0,0 +1,86 @@ +# Stint +# Copyright 2018-Present Status Research & Development GmbH +# Licensed under either of +# +# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +# +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Status lib + stew/bitops2, + # Internal + ./datatypes + +# Bitwise operations +# -------------------------------------------------------- +{.push raises: [], inline, noInit, gcsafe.} + +func bitnot*(r: var StUint, a: Stuint) = + ## Bitwise complement of unsigned integer a + ## i.e. 
flips all bits of the input + for wr, wa in leastToMostSig(r, a): + wr = not wa + r.clearExtraBits() + +func bitor*(r: var Stuint, a, b: Stuint) = + ## `Bitwise or` of numbers a and b + for wr, wa, wb in leastToMostSig(r, a, b): + wr = wa or wb + +func bitand*(r: var Stuint, a, b: Stuint) = + ## `Bitwise and` of numbers a and b + for wr, wa, wb in leastToMostSig(r, a, b): + wr = wa and wb + +func bitxor*(r: var Stuint, a, b: Stuint) = + ## `Bitwise xor` of numbers x and y + for wr, wa, wb in leastToMostSig(r, a, b): + wr = wa xor wb + r.clearExtraBits() + +func countOnes*(a: Stuint): int = + result = 0 + for wa in leastToMostSig(a): + result += countOnes(wa) + +func parity*(a: Stuint): int = + result = parity(a.limbs[0]) + for i in 1 ..< a.limbs.len: + result = result xor parity(a.limbs[i]) + +func leadingZeros*(a: Stuint): int = + result = 0 + + # Adjust when we use only part of the word size + var extraBits = WordBitWidth * a.limbs.len - a.bits + + for word in mostToLeastSig(a): + let zeroCount = word.leadingZeros() + if extraBits > 0: + result += zeroCount - min(extraBits, WordBitWidth) + extraBits -= WordBitWidth + else: + result += zeroCount + if zeroCount != WordBitWidth: + break + +func trailingZeros*(a: Stuint): int = + result = 0 + for word in leastToMostSig(a): + let zeroCount = word.trailingZeros() + result += zeroCount + if zeroCount != WordBitWidth: + break + + when a.limbs.len * WordBitWidth != a.bits: + if result > a.bits: + result = a.bits + +func firstOne*(a: Stuint): int = + result = trailingZeros(a) + if result == a.limbs.len * WordBitWidth: + result = 0 + else: + result += 1 diff --git a/stint/private/uint_shift.nim b/stint/private/uint_shift.nim index 3bff3d6..13ff418 100644 --- a/stint/private/uint_shift.nim +++ b/stint/private/uint_shift.nim @@ -8,8 +8,11 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. 
import + # Status lib + stew/bitops2, + # Internal ./datatypes - + # Shifts # -------------------------------------------------------- {.push raises: [], gcsafe.} @@ -91,3 +94,39 @@ func shlWords*(r: var Limbs, a: Limbs, w: SomeInteger) = else: for i in countdown(Limbs.len-1, 0): r[i] = a[i-w] + +# Wrappers +# -------------------------------------------------------- + +func shiftRight*(r: var Stuint, a: Stuint, k: SomeInteger) = + ## Shift `a` right by k bits and store in `r` + if k < WordBitWidth: + r.limbs.shrSmall(a.limbs, k) + return + + # w = k div WordBitWidth, shift = k mod WordBitWidth + let w = k shr static(log2trunc(uint32(WordBitWidth))) + let shift = k and (WordBitWidth - 1) + + if shift == 0: + r.limbs.shrWords(a.limbs, w) + else: + r.limbs.shrLarge(a.limbs, w, shift) + +func shiftLeft*(r: var Stuint, a: Stuint, k: SomeInteger) = + ## Shift `a` left by k bits and store in `r` + if k < WordBitWidth: + r.limbs.shlSmall(a.limbs, k) + r.clearExtraBits() + return + + # w = k div WordBitWidth, shift = k mod WordBitWidth + let w = k shr static(log2trunc(uint32(WordBitWidth))) + let shift = k and (WordBitWidth - 1) + + if shift == 0: + r.limbs.shlWords(a.limbs, w) + else: + r.limbs.shlLarge(a.limbs, w, shift) + + r.clearExtraBits() diff --git a/stint/uintops.nim b/stint/uintops.nim index a96bf1d..e89987c 100644 --- a/stint/uintops.nim +++ b/stint/uintops.nim @@ -1,5 +1,5 @@ # Stint -# Copyright 2018-2020 Status Research & Development GmbH +# Copyright 2018-Present Status Research & Development GmbH # Licensed under either of # # * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) @@ -8,11 +8,11 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. 
import - # Status lib - stew/bitops2, # Internal ./private/datatypes, + ./private/uint_bitwise, ./private/uint_shift, + ./private/uint_addsub, ./private/primitives/addcarry_subborrow export StUint @@ -99,119 +99,41 @@ func isEven*(a: Stuint): bool {.inline.} = # -------------------------------------------------------- {.push raises: [], inline, noInit, gcsafe.} -template clearExtraBits(a: var StUint) = - ## A Stuint is stored in an array of 32 of 64-bit word - ## If we do bit manipulation at the word level, - ## for example a 8-bit stuint stored in a 64-bit word - ## we need to clear the upper 56-bit - when a.bits != a.limbs.len * WordBitWidth: - const posExtraBits = a.bits - (a.limbs.len-1) * WordBitWidth - const mask = (Word(1) shl posExtraBits) - 1 - mostSignificantWord(a) = mostSignificantWord(a) and mask - func `not`*(a: Stuint): Stuint = ## Bitwise complement of unsigned integer a ## i.e. flips all bits of the input - for wr, wa in leastToMostSig(result, a): - wr = not wa - result.clearExtraBits() + result.bitnot(a) func `or`*(a, b: Stuint): Stuint = ## `Bitwise or` of numbers a and b - for wr, wa, wb in leastToMostSig(result, a, b): - wr = wa or wb + result.bitor(a, b) func `and`*(a, b: Stuint): Stuint = ## `Bitwise and` of numbers a and b - for wr, wa, wb in leastToMostSig(result, a, b): - wr = wa and wb + result.bitand(a, b) func `xor`*(a, b: Stuint): Stuint = ## `Bitwise xor` of numbers x and y - for wr, wa, wb in leastToMostSig(result, a, b): - wr = wa xor wb - result.clearExtraBits() - -func countOnes*(a: Stuint): int = - result = 0 - for wa in leastToMostSig(a): - result += countOnes(wa) - -func parity*(a: Stuint): int = - result = parity(a.limbs[0]) - for i in 1 ..< a.limbs.len: - result = result xor parity(a.limbs[i]) - -func leadingZeros*(a: Stuint): int = - result = 0 - - # Adjust when we use only part of the word size - var extraBits = WordBitWidth * a.limbs.len - a.bits - - for word in mostToLeastSig(a): - let zeroCount = word.leadingZeros() - if 
extraBits > 0: - result += zeroCount - min(extraBits, WordBitWidth) - extraBits -= WordBitWidth - else: - result += zeroCount - if zeroCount != WordBitWidth: - break - -func trailingZeros*(a: Stuint): int = - result = 0 - for word in leastToMostSig(a): - let zeroCount = word.trailingZeros() - result += zeroCount - if zeroCount != WordBitWidth: - break + result.bitxor(a, b) - when a.limbs.len * WordBitWidth != a.bits: - if result > a.bits: - result = a.bits +{.pop.} # End noInit -func firstOne*(a: Stuint): int = - result = trailingZeros(a) - if result == a.limbs.len * WordBitWidth: - result = 0 - else: - result += 1 +export + countOnes, + parity, + leadingZeros, + trailingZeros, + firstOne -{.pop.} # End noInit {.push raises: [], inline, gcsafe.} func `shr`*(a: Stuint, k: SomeInteger): Stuint = ## Shift right by k bits - if k < WordBitWidth: - result.limbs.shrSmall(a.limbs, k) - return - - # w = k div WordBitWidth, shift = k mod WordBitWidth - let w = k shr static(log2trunc(uint32(WordBitWidth))) - let shift = k and (WordBitWidth - 1) - - if shift == 0: - result.limbs.shrWords(a.limbs, w) - else: - result.limbs.shrLarge(a.limbs, w, shift) + result.shiftRight(a, k) func `shl`*(a: Stuint, k: SomeInteger): Stuint = ## Shift left by k bits - if k < WordBitWidth: - result.limbs.shlSmall(a.limbs, k) - result.clearExtraBits() - return - - # w = k div WordBitWidth, shift = k mod WordBitWidth - let w = k shr static(log2trunc(uint32(WordBitWidth))) - let shift = k and (WordBitWidth - 1) - - if shift == 0: - result.limbs.shlWords(a.limbs, w) - else: - result.limbs.shlLarge(a.limbs, w, shift) - - result.clearExtraBits() + result.shiftLeft(a, k) {.pop.} @@ -221,52 +143,27 @@ func `shl`*(a: Stuint, k: SomeInteger): Stuint = func `+`*(a, b: Stuint): Stuint = ## Addition for multi-precision unsigned int - var carry = Carry(0) - for wr, wa, wb in leastToMostSig(result, a, b): - addC(carry, wr, wa, wb, carry) - result.clearExtraBits() + result.sum(a, b) -func `+=`*(a: var Stuint, b: 
Stuint) = - ## In-place addition for multi-precision unsigned int - var carry = Carry(0) - for wa, wb in leastToMostSig(a, b): - addC(carry, wa, wa, wb, carry) - a.clearExtraBits() +export `+=` func `-`*(a, b: Stuint): Stuint = ## Substraction for multi-precision unsigned int - var borrow = Borrow(0) - for wr, wa, wb in leastToMostSig(result, a, b): - subB(borrow, wr, wa, wb, borrow) - result.clearExtraBits() + result.diff(a, b) -func `-=`*(a: var Stuint, b: Stuint) = - ## In-place substraction for multi-precision unsigned int - var borrow = Borrow(0) - for wa, wb in leastToMostSig(a, b): - subB(borrow, wa, wa, wb, borrow) - a.clearExtraBits() +export `-=` -func inc*(a: var Stuint, w: Word = 1) = - var carry = Carry(0) - when cpuEndian == littleEndian: - addC(carry, a.limbs[0], a.limbs[0], w, carry) - for i in 1 ..< a.limbs.len: - addC(carry, a.limbs[i], a.limbs[i], 0, carry) - a.clearExtraBits() +export inc func `+`*(a: Stuint, b: SomeUnsignedInt): Stuint = ## Addition for multi-precision unsigned int ## with an unsigned integer - result = a - result.inc(Word(b)) + result.sum(a, Word(b)) -func `+=`*(a: var Stuint, b: SomeUnsignedInt) = - ## In-place addition for multi-precision unsigned int - ## with an unsigned integer - a.inc(Word(b)) +export `+=` {.pop.} + # Multiplication # -------------------------------------------------------- # Multiplication is implemented in a separate file at the limb-level @@ -311,7 +208,6 @@ func pow*(a: Stuint, e: Natural): Stuint = func pow*[aBits, eBits](a: Stuint[aBits], e: Stuint[eBits]): Stuint[aBits] = ## Compute ``x`` to the power of ``y``, ## ``x`` must be non-negative - # Implementation uses exponentiation by squaring # See Nim math module: https://github.com/nim-lang/Nim/blob/4ed24aa3eb78ba4ff55aac3008ec3c2427776e50/lib/pure/math.nim#L429 # And Eli Bendersky's blog: https://eli.thegreenplace.net/2009/03/21/efficient-integer-exponentiation-algorithms @@ -329,6 +225,5 @@ func pow*[aBits, eBits](a: Stuint[aBits], e: 
Stuint[eBits]): Stuint[aBits] = {.pop.} - # Division & Modulo # -------------------------------------------------------- From f952314c21f9fb8be28b1df3cd20e33118e38b4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= Date: Fri, 6 Aug 2021 14:44:25 +0200 Subject: [PATCH 15/26] dump progress --- stint/private/datatypes.nim | 22 ++- stint/private/uint_div.nim | 344 ++++++++++------------------------- stint/private/uint_shift.nim | 25 ++- 3 files changed, 146 insertions(+), 245 deletions(-) diff --git a/stint/private/datatypes.nim b/stint/private/datatypes.nim index f1585e7..0e87408 100644 --- a/stint/private/datatypes.nim +++ b/stint/private/datatypes.nim @@ -126,7 +126,7 @@ iterator leastToMostSig*[aBits, bBits](a: var SomeBigInteger[aBits], b: SomeBigI for i in 0 ..< min(a.limbs.len, b.limbs.len): yield (a.limbs[i], b.limbs[i]) else: - for i in countdown(min(aLimbs.len, b.limbs.len)-1, 0): + for i in countdown(min(a.limbs.len, b.limbs.len)-1, 0): yield (a.limbs[i], b.limbs[i]) iterator leastToMostSig*(c: var SomeBigInteger, a, b: SomeBigInteger): (var Word, Word, Word) = @@ -176,3 +176,23 @@ macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped ident("unrolledIter_" & $idx & $i), body.replaceNodes(idx, newLit i) ) + +# Copy +# -------------------------------------------------------- + +func copyFrom*[dLen, sLen]( + dst: var SomeBigInteger[dLen], + src: SomeBigInteger[sLen] + ){.inline.} = + ## Copy a BigInteger, truncated to 2^slen if the source + ## is larger than the destination + when cpuEndian == littleEndian: + for i in 0 ..< min(dst.limbs.len, src.limbs.len): + dst.limbs[i] = src.limbs[i] + for i in src.limbs.len ..< dst.limbs.len: + dst.limbs[i] = 0 + else: + for i in countdown(dst.limbs.len-1, src.limbs.len): + dst.limbs[i] = 0 + for i in countdown(src.limbs.len-1, 0): + dst.limbs[i] = src.limbs[i] diff --git a/stint/private/uint_div.nim b/stint/private/uint_div.nim index 8afe840..fb62b73 100644 --- 
a/stint/private/uint_div.nim +++ b/stint/private/uint_div.nim @@ -7,206 +7,42 @@ # # at your option. This file may not be copied, modified, or distributed except according to those terms. -import ./bitops2_priv, ./conversion, ./initialization, - ./datatypes, - ./uint_comparison, - ./uint_bitwise_ops, - ./uint_addsub, - ./uint_mul - -# ################### Division ################### # -# We use the following algorithm: -# - Fast recursive division by Burnikel and Ziegler - -################################################################################################################### -## ## -## Grade school division, but with (very) large digits, dividing [a1,a2,a3,a4] by [b1,b2]: ## -## ## -## +----+----+----+----+ +----+----+ +----+ ## -## | a1 | a2 | a3 | a4 | / | b1 | b2 | = | q1 | DivideThreeHalvesByTwo(a1a2, a3, b1b2, n, q1, r1r2) ## -## +----+----+----+----+ +----+----+ +----+ ## -## +--------------+ | | ## -## | b1b2 * q1 | | | ## -## +--------------+ | | ## -## - ================ v | ## -## +----+----+----+ +----+----+ | +----+ ## -## | r1 | r2 | a4 | / | b1 | b2 | = | | q2 | DivideThreeHalvesByTwo(r1r2, a4, b1b2, n, q1, r1r2) ## -## +----+----+----+ +----+----+ | +----+ ## -## +--------------+ | | ## -## | b1b2 * q2 | | | ## -## +--------------+ | | ## -## - ================ v v ## -## +----+----+ +----+----+ ## -## | r1 | r2 | | q1 | q2 | r1r2 = a1a2a3a4 mod b1b2, q1q2 = a1a2a3a4 div b1b2 ## -## +----+----+ +----+----+ , ## -## ## -## Note: in the diagram above, a1, b1, q1, r1 etc. are the most significant "digits" of their numbers. 
## -## ## -################################################################################################################### - -func div2n1n[T: SomeUnsignedInt](q, r: var T, n_hi, n_lo, d: T) -func div2n1n(q, r: var UintImpl, ah, al, b: UintImpl) - # Forward declaration - -func divmod*(x, y: SomeUnsignedInt): tuple[quot, rem: SomeUnsignedInt] {.inline.}= - # hopefully the compiler fuse that in a single op - (x div y, x mod y) - -func divmod*[T](x, y: UintImpl[T]): tuple[quot, rem: UintImpl[T]] - # Forward declaration - -func div3n2n[T]( q: var UintImpl[T], - r: var UintImpl[UintImpl[T]], - a2, a1, a0: UintImpl[T], - b: UintImpl[UintImpl[T]]) = - - var - c: UintImpl[T] - d: UintImpl[UintImpl[T]] - carry: bool - - if a2 < b.hi: - div2n1n(q, c, a2, a1, b.hi) - else: - q = zero(type q) - one(type q) # We want 0xFFFFF .... - c = a1 + b.hi - if c < a1: - carry = true - - extPrecMul[T](d, q, b.lo) - let ca0 = UintImpl[type c](hi: c, lo: a0) - - r = ca0 - d - - if (not carry) and (d > ca0): - q -= one(type q) - r += b - - # if there was no carry - if r > b: - q -= one(type q) - r += b - -proc div3n2n[T: SomeUnsignedInt]( - q: var T, - r: var UintImpl[T], - a2, a1, a0: T, - b: UintImpl[T]) = - - var - c: T - d: UintImpl[T] - carry: bool - - if a2 < b.hi: - div2n1n(q, c, a2, a1, b.hi) - - else: - q = 0.T - 1.T # We want 0xFFFFF .... 
- c = a1 + b.hi - if c < a1: - carry = true - - extPrecMul[T](d, q, b.lo) - let ca0 = UintImpl[T](hi: c, lo: a0) - r = ca0 - d - - if (not carry) and d > ca0: - dec q - r += b - - # if there was no carry - if r > b: - dec q - r += b - -func div2n1n(q, r: var UintImpl, ah, al, b: UintImpl) = - - # doAssert leadingZeros(b) == 0, "Divisor was not normalized" - - var s: UintImpl - div3n2n(q.hi, s, ah.hi, ah.lo, al.hi, b) - div3n2n(q.lo, r, s.hi, s.lo, al.lo, b) - -func div2n1n[T: SomeUnsignedInt](q, r: var T, n_hi, n_lo, d: T) = - - # doAssert leadingZeros(d) == 0, "Divisor was not normalized" - - const - size = bitsof(q) - halfSize = size div 2 - halfMask = (1.T shl halfSize) - 1.T - - template halfQR(n_hi, n_lo, d, d_hi, d_lo: T): tuple[q,r: T] = - - var (q, r) = divmod(n_hi, d_hi) - let m = q * d_lo - r = (r shl halfSize) or n_lo - - # Fix the reminder, we're at most 2 iterations off - if r < m: - dec q - r += d - if r >= d and r < m: - dec q - r += d - r -= m - (q, r) - - let - d_hi = d shr halfSize - d_lo = d and halfMask - n_lohi = n_lo shr halfSize - n_lolo = n_lo and halfMask - - # First half of the quotient - let (q1, r1) = halfQR(n_hi, n_lohi, d, d_hi, d_lo) - - # Second half - let (q2, r2) = halfQR(r1, n_lolo, d, d_hi, d_lo) - - q = (q1 shl halfSize) or q2 - r = r2 - -func divmodBZ[T](x, y: UintImpl[T], q, r: var UintImpl[T])= - - doAssert y.isZero.not() # This should be checked on release mode in the divmod caller proc - - if y.hi.isZero: - # Shortcut if divisor is smaller than half the size of the type - if x.hi < y.lo: - # Normalize - let - clz = leadingZeros(y.lo) - xx = x shl clz - yy = y.lo shl clz - - # If y is smaller than the base, normalizing x does not overflow. 
- # Compute directly the low part - div2n1n(q.lo, r.lo, xx.hi, xx.lo, yy) - # Undo normalization - r.lo = r.lo shr clz - return - - # General case - - # Normalization - let clz = leadingZeros(y) - - let - xx = UintImpl[type x](lo: x) shl clz - yy = y shl clz - - # Compute - div2n1n(q, r, xx.hi, xx.lo, yy) - - # Undo normalization - r = r shr clz - -func divmodBS(x, y: UintImpl, q, r: var UintImpl) = +import + # Status lib + stew/bitops2, + # Internal + ./datatypes, + ./uint_bitwise, + ./uint_shift + +# Division +# -------------------------------------------------------- + +func shortDiv*(a: var Limbs, k: Word): Word = + ## Divide `a` by k in-place and return the remainder + result = Word(0) + + let clz = leadingZeros(k) + let normK = k shl clz + + for i in countdown(a.len-1, 0): + # dividend = 2^64 * remainder + a[i] + var hi = result + var lo = a[i] + # Normalize, shifting the remainder by clz(k) cannot overflow. + hi = (hi shl clz) or (lo shr (WordBitWidth - clz)) + lo = lo shl clz + div2n1n(a[i], result, hi, lo, normK) + # Undo normalization + result = result shr clz + +func binaryShiftDiv[qLen, rLen, uLen, vLen: static int]( + q: var Limbs[qLen], + r: var Limbs[rLen], + u: Limbs[uLen], + v: Limbs[vLen]) = ## Division for multi-precision unsigned uint ## Implementation through binary shift division - doAssert y.isZero.not() # This should be checked on release mode in the divmod caller proc type SubTy = type x.lo @@ -226,16 +62,79 @@ func divmodBS(x, y: UintImpl, q, r: var UintImpl) = d = d shr 1 dec(shift) +func knuthDivLE[qLen, rLen, uLen, vLen: static int]( + q: var Limbs[qLen], + r: var Limbs[rLen], + u: Limbs[uLen], + v: Limbs[vLen], + needRemainder: bool) = + ## Compute the quotient and remainder (if needed) + ## of the division of u by v + ## + ## - q must be of size uLen - vLen + 1 (assuming u and v uses all words) + ## - r must be of size vLen (assuming v uses all words) + ## - uLen >= vLen + ## + ## Knuth Division + ## - Knuth's "Algorithm D", The Art 
of Computer Programming, 1998 + ## - Warren, Hacker's Delight, 2013 + ## + ## For now only LittleEndian is implemented + + # Find the most significant word with actual set bits + # and get the leading zero count there + var divisorLen = vLen + var clz: int + for w in mostToLeastSig(v): + if w != 0: + clz = leadingZeros(w) + break + else: + divisorLen -= 1 + + doAssert divisorLen != 0, "Division by zero. Abandon ship!" + + if divisorLen == 1: + q.copyFrom(u) + r.leastSignificantWord() = q.shortDiv(v.leastSignificantWord()) + # zero all but the least significant word + var lsw = true + for w in leastToMostSig(r): + if lsw: + lsw = false + else: + w = 0 + return + + var un {.noInit.}: Limbs[uLen+1] + var vn {.noInit.}: Limbs[vLen] # [divisorLen .. vLen] range is unused + + # Normalize so that the divisor MSB is set, + # vn cannot overflow, un can overflowed by 1 word at most, hence uLen+1 + un.shlSmallOverflowing(u, clz) + vn.shlSmall(v, clz) + + static: doAssert cpuEndian == littleEndian, "As it is the division algorithm requires little endian ordering of the limbs." + # TODO: is it worth it to have the uint be the exact same extended precision representation + # as a wide int (say uint128 or uint256)? + # in big-endian, the following loop must go the other way and the -1 must be +1 + for j in countdown(uLen - divisorLen, 0, 1): + # Compute qhat estimate of q[j] (off by 0, 1 and rarely 2) + var qhat, rhat: Word + let hi = un[j+divisorLen] + let lo = un[j+divisorLen-1] + div2n1n(qhat, rhat, hi, lo, vn[divisorLen-1]) + + const BinaryShiftThreshold = 8 # If the difference in bit-length is below 8 # binary shift is probably faster func divmod*[T](x, y: UintImpl[T]): tuple[quot, rem: UintImpl[T]]= - let x_clz = x.leadingZeros - let y_clz = y.leadingZeros + let x_clz = x.leadingZeros() + let y_clz = y.leadingZeros() # We short-circuit division depending on special-cases. 
- # TODO: Constant-time division if unlikely(y.isZero): raise newException(DivByZeroDefect, "You attempted to divide by zero") elif y_clz == (bitsof(y) - 1): @@ -248,7 +147,6 @@ func divmod*[T](x, y: UintImpl[T]): tuple[quot, rem: UintImpl[T]]= # y is a power of 2. (this also matches 0 but it was eliminated earlier) # TODO. Would it be faster to use countTrailingZero (ctz) + clz == size(y) - 1? # Especially because we shift by ctz after. - # It is a bit tricky with recursive types. An empty n.lo means 0 or sizeof(n.lo) let y_ctz = bitsof(y) - y_clz - 1 result.quot = x shr y_ctz result.rem = x and (y - one(type y)) @@ -257,7 +155,7 @@ func divmod*[T](x, y: UintImpl[T]): tuple[quot, rem: UintImpl[T]]= elif x < y: result.rem = x elif (y_clz - x_clz) < BinaryShiftThreshold: - divmodBS(x, y, result.quot, result.rem) + binaryShiftDiv(x, y, result.quot, result.rem) else: divmodBZ(x, y, result.quot, result.rem) @@ -268,43 +166,3 @@ func `div`*(x, y: UintImpl): UintImpl {.inline.} = func `mod`*(x, y: UintImpl): UintImpl {.inline.} = ## Division operation for multi-precision unsigned uint divmod(x,y).rem - - -# ###################################################################### -# Division implementations -# -# Division is the most costly operation -# And also of critical importance for cryptography application - -# ##### Research ##### - -# Overview of division algorithms: -# - https://gmplib.org/manual/Division-Algorithms.html#Division-Algorithms -# - https://gmplib.org/~tege/division-paper.pdf -# - Comparison of fast division algorithms for large integers: http://bioinfo.ict.ac.cn/~dbu/AlgorithmCourses/Lectures/Hasselstrom2003.pdf - -# Libdivide has an implementations faster than hardware if dividing by the same number is needed -# - http://libdivide.com/documentation.html -# - https://github.com/ridiculousfish/libdivide/blob/master/libdivide.h -# Furthermore libdivide also has branchless implementations - -# Implementation: we use recursive fast division by Burnikel 
and Ziegler. -# -# It is build upon divide and conquer algorithm that can be found in: -# - Hacker's delight: http://www.hackersdelight.org/hdcodetxt/divDouble.c.txt -# - Libdivide -# - Code project: https://www.codeproject.com/Tips/785014/UInt-Division-Modulus -# - Cuda-uint128 (unfinished): https://github.com/curtisseizert/CUDA-uint128/blob/master/cuda_uint128.h -# - Mpdecimal: https://github.com/status-im/nim-decimal/blob/9b65e95299cb582b14e0ae9a656984a2ce0bab03/decimal/mpdecimal_wrapper/generated/basearith.c#L305-L412 - -# Description of recursive fast division by Burnikel and Ziegler (http://www.mpi-sb.mpg.de/~ziegler/TechRep.ps.gz): -# - Python implementation: https://bugs.python.org/file11060/fast_div.py and discussion https://bugs.python.org/issue3451 -# - C++ implementation: https://github.com/linbox-team/givaro/blob/master/src/kernel/recint/rudiv.h -# - The Handbook of Elliptic and Hyperelliptic Cryptography Algorithm 10.35 on page 188 has a more explicit version of the div2NxN algorithm. This algorithm is directly recursive and avoids the mutual recursion of the original paper's calls between div2NxN and div3Nx2N. - -# Other libraries that can be used as reference for alternative (?) implementations: -# - TTMath: https://github.com/status-im/nim-ttmath/blob/8f6ff2e57b65a350479c4012a53699e262b19975/src/headers/ttmathuint.h#L1530-L2383 -# - LibTomMath: https://github.com/libtom/libtommath -# - Google Abseil: https://github.com/abseil/abseil-cpp/tree/master/absl/numeric -# - Crypto libraries like libsecp256k1, OpenSSL, ... though they are not generics. (uint256 only for example) -# Note: GMP/MPFR are GPL. The papers can be used but not their code. 
diff --git a/stint/private/uint_shift.nim b/stint/private/uint_shift.nim index 13ff418..a1441b0 100644 --- a/stint/private/uint_shift.nim +++ b/stint/private/uint_shift.nim @@ -12,7 +12,7 @@ import stew/bitops2, # Internal ./datatypes - + # Shifts # -------------------------------------------------------- {.push raises: [], gcsafe.} @@ -58,6 +58,29 @@ func shrWords*(r: var Limbs, a: Limbs, w: SomeInteger) = for i in countdown(Limbs.len-w, 0): r[i] = a[i+w] +func shlSmallOverflowing*[rLen, aLen: static int]( + r: var Limbs[rLen], a: Limbs[aLen], k: SomeInteger) = + ## Compute the `shift left` operation of x and k + ## + ## k MUST be less than the base word size (2^32 or 2^64) + when cpuEndian == littleEndian: + r[0] = a[0] shl k + for i in 1 ..< a.len: + r[i] = (a[i] shl k) or (a[i-1] shr (WordBitWidth - k)) + if rLen > aLen: + r[aLen] = a[aLen - 1] shr (WordBitWidth - k) + for i in aLen+1 ..< rLen: + r[i] = 0 + else: + const offset = rLen - aLen + r[^1] = a[^1] shl k + for i in countdown(a.len-2, 0): + r[i+offset] = (a[i] shl k) or (a[i+1] shr (WordBitWidth - k)) + if rLen > aLen: + r[offset-1] = a[0] shr (WordBitWidth - k) + for i in 0 ..< offset-1: + r[i] = 0 + func shlSmall*(r: var Limbs, a: Limbs, k: SomeInteger) = ## Compute the `shift left` operation of x and k ## From c2ed8a4bc2eb51608e74f6c1ade077e14787843f Mon Sep 17 00:00:00 2001 From: Mamy Ratsimbazafy Date: Wed, 12 Jan 2022 18:25:55 +0100 Subject: [PATCH 16/26] stash div refactor --- stint.nim | 4 +- stint/private/uint_div.nim | 218 ++++++++++++++++++++++++++++--------- stint/uintops.nim | 5 +- 3 files changed, 170 insertions(+), 57 deletions(-) diff --git a/stint.nim b/stint.nim index 2631d54..b6c02e7 100644 --- a/stint.nim +++ b/stint.nim @@ -10,8 +10,8 @@ # import stint/[bitops2, endians2, intops, io, modular_arithmetic, literals_stint] # export bitops2, endians2, intops, io, modular_arithmetic, literals_stint -import stint/[io, uintops, bitops2] -export io, uintops, bitops2 +import stint/[io, 
uintops] +export io, uintops type # Int128* = Stint[128] diff --git a/stint/private/uint_div.nim b/stint/private/uint_div.nim index fb62b73..e3bfaed 100644 --- a/stint/private/uint_div.nim +++ b/stint/private/uint_div.nim @@ -13,7 +13,8 @@ import # Internal ./datatypes, ./uint_bitwise, - ./uint_shift + ./uint_shift, + ./primitives/[addcarry_subborrow, extended_precision] # Division # -------------------------------------------------------- @@ -36,31 +37,31 @@ func shortDiv*(a: var Limbs, k: Word): Word = # Undo normalization result = result shr clz -func binaryShiftDiv[qLen, rLen, uLen, vLen: static int]( - q: var Limbs[qLen], - r: var Limbs[rLen], - u: Limbs[uLen], - v: Limbs[vLen]) = - ## Division for multi-precision unsigned uint - ## Implementation through binary shift division - doAssert y.isZero.not() # This should be checked on release mode in the divmod caller proc +# func binaryShiftDiv[qLen, rLen, uLen, vLen: static int]( +# q: var Limbs[qLen], +# r: var Limbs[rLen], +# u: Limbs[uLen], +# v: Limbs[vLen]) = +# ## Division for multi-precision unsigned uint +# ## Implementation through binary shift division +# doAssert y.isZero.not() # This should be checked on release mode in the divmod caller proc - type SubTy = type x.lo +# type SubTy = type x.lo - var - shift = y.leadingZeros - x.leadingZeros - d = y shl shift +# var +# shift = y.leadingZeros - x.leadingZeros +# d = y shl shift - r = x +# r = x - while shift >= 0: - q += q - if r >= d: - r -= d - q.lo = q.lo or one(SubTy) +# while shift >= 0: +# q += q +# if r >= d: +# r -= d +# q.lo = q.lo or one(SubTy) - d = d shr 1 - dec(shift) +# d = d shr 1 +# dec(shift) func knuthDivLE[qLen, rLen, uLen, vLen: static int]( q: var Limbs[qLen], @@ -75,11 +76,9 @@ func knuthDivLE[qLen, rLen, uLen, vLen: static int]( ## - r must be of size vLen (assuming v uses all words) ## - uLen >= vLen ## - ## Knuth Division - ## - Knuth's "Algorithm D", The Art of Computer Programming, 1998 - ## - Warren, Hacker's Delight, 2013 - 
## ## For now only LittleEndian is implemented + # + # Resources at the bottom of the file # Find the most significant word with actual set bits # and get the leading zero count there @@ -92,9 +91,10 @@ func knuthDivLE[qLen, rLen, uLen, vLen: static int]( else: divisorLen -= 1 - doAssert msw != 0, "Division by zero. Abandon ship!" + doAssert divisorLen != 0, "Division by zero. Abandon ship!" - if mswLen == 1: + # Divisor is a single word. + if divisorLen == 1: q.copyFrom(u) r.leastSignificantWord() = q.shortDiv(v.leastSignificantWord()) # zero all but the least significant word @@ -114,23 +114,64 @@ func knuthDivLE[qLen, rLen, uLen, vLen: static int]( un.shlSmallOverflowing(u, clz) vn.shlSmall(v, clz) - static: doAssert cpuEndian == littleEndian, "As it is the division algorithm requires little endian ordering of the limbs". + static: doAssert cpuEndian == littleEndian, "Currently the division algorithm requires little endian ordering of the limbs" # TODO: is it worth it to have the uint be the exact same extended precision representation # as a wide int (say uint128 or uint256)? 
# in big-endian, the following loop must go the other way and the -1 must be +1 + + let vhi = vn[divisorLen-1] + let vlo = vn[divisorLen-2] + for j in countdown(uLen - divisorLen, 0, 1): # Compute qhat estimate of q[j] (off by 0, 1 and rarely 2) var qhat, rhat: Word - let hi = un[j+divisorLen] - let lo = un[j+divisorLen-1] - div2n1n(qhat, rhat, hi, lo, vn[divisorLen-1]) + let uhi = un[j+divisorLen] + let ulo = un[j+divisorLen-1] + div2n1n(qhat, rhat, uhi, ulo, vhi) + var mhi, mlo: Word + var rhi, rlo: Word + mul(mhi, mlo, qhat, vlo) + rhi = rhat + rlo = ulo + + # if r < m, adjust approximation, up to twice + while rhi < mhi or (rhi == mhi and rlo < mlo): + qhat -= 1 + rhi += vhi + # Found the quotient + q[j] = qhat + + # un -= qhat * v + var borrow = Borrow(0) + var qvhi, qvlo: Word + for i in 0 ..< divisorLen-1: + mul(qvhi, qvlo, qhat, v[i]) + subB(borrow, un[j+i], un[j+i], qvlo, borrow) + subB(borrow, un[j+i+1], un[j+i+1], qvhi, borrow) + # Last step + mul(qvhi, qvlo, qhat, v[divisorLen-1]) + subB(borrow, un[j+divisorLen-1], un[j+divisorLen-1], qvlo, borrow) + qvhi += Word(borrow) + let isNeg = un[j+divisorLen] < qvhi + un[j+divisorLen] -= qvhi + + if isNeg: + # oops, too big by one, add back + q[j] -= 1 + var carry = Carry(0) + for i in 0 ..< divisorLen: + addC(carry, u[j+i], u[j+i], v[i], carry) + + # Quotient is found, if remainder is needed we need to un-normalize un + if needRemainder: + r.shrSmall(un, clz) const BinaryShiftThreshold = 8 # If the difference in bit-length is below 8 # binary shift is probably faster -func divmod*[T](x, y: UintImpl[T]): tuple[quot, rem: UintImpl[T]]= - +func divmod(q, r: var Stuint, + x: Limbs[xLen], y: Limbs[yLen], needRemainder: bool) = let x_clz = x.leadingZeros() let y_clz = y.leadingZeros() @@ -139,30 +180,99 @@ func divmod*[T](x, y: UintImpl[T]): tuple[quot, rem: UintImpl[T]]= raise newException(DivByZeroDefect, "You attempted to divide by zero") elif y_clz == (bitsof(y) - 1): # y is one - result.quot = x - elif (x.hi or 
y.hi).isZero: - # If computing just on the low part is enough - (result.quot.lo, result.rem.lo) = divmod(x.lo, y.lo) - elif (y and (y - one(type y))).isZero: - # y is a power of 2. (this also matches 0 but it was eliminated earlier) - # TODO. Would it be faster to use countTrailingZero (ctz) + clz == size(y) - 1? - # Especially because we shift by ctz after. - let y_ctz = bitsof(y) - y_clz - 1 - result.quot = x shr y_ctz - result.rem = x and (y - one(type y)) + q = x + # elif (x.hi or y.hi).isZero: + # # If computing just on the low part is enough + # (result.quot.lo, result.rem.lo) = divmod(x.lo, y.lo, needRemainder) + # elif (y and (y - one(type y))).isZero: + # # y is a power of 2. (this also matches 0 but it was eliminated earlier) + # # TODO. Would it be faster to use countTrailingZero (ctz) + clz == size(y) - 1? + # # Especially because we shift by ctz after. + # let y_ctz = bitsof(y) - y_clz - 1 + # result.quot = x shr y_ctz + # if needRemainder: + # result.rem = x and (y - one(type y)) elif x == y: - result.quot.lo = one(T) + q.setOne() elif x < y: - result.rem = x - elif (y_clz - x_clz) < BinaryShiftThreshold: - binaryShiftDiv(x, y, result.quot, result.rem) + r = x + # elif (y_clz - x_clz) < BinaryShiftThreshold: + # binaryShiftDiv(x, y, result.quot, result.rem) else: - divmodBZ(x, y, result.quot, result.rem) + knuthDivLE(q, r, x, y, needRemainder) -func `div`*(x, y: UintImpl): UintImpl {.inline.} = +func `div`*(x, y: Stuint): Stuint {.inline.} = ## Division operation for multi-precision unsigned uint - divmod(x,y).quot + var tmp{.noInit.}: Stuint + divmod(result, tmp, x, y, needRemainder = false) -func `mod`*(x, y: UintImpl): UintImpl {.inline.} = - ## Division operation for multi-precision unsigned uint - divmod(x,y).rem +func `mod`*(x, y: Stuint): Stuint {.inline.} = + ## Remainder operation for multi-precision unsigned uint + var tmp{.noInit.}: Stuint + divmod(tmp, result, x,y, needRemainder = true) + +func divmod*(x, y: Stuint): tuple[quot, rem: 
Stuint] = + ## Division and remainder operations for multi-precision unsigned uint + divmod(result.quot, result.rem, x, y, needRemainder = true) + +# ###################################################################### +# Division implementations +# +# Multi-precision division is a costly +#and also difficult to implement operation + +# ##### Research ##### + +# Overview of division algorithms: +# - https://gmplib.org/manual/Division-Algorithms.html#Division-Algorithms +# - https://gmplib.org/~tege/division-paper.pdf +# - Comparison of fast division algorithms for large integers: http://bioinfo.ict.ac.cn/~dbu/AlgorithmCourses/Lectures/Lec5-Fast-Division-Hasselstrom2003.pdf + +# Schoolbook / Knuth Division (Algorithm D) +# - https://skanthak.homepage.t-online.de/division.html +# Review of implementation flaws +# - Hacker's Delight https://github.com/hcs0/Hackers-Delight/blob/master/divmnu64.c.txt +# - LLVM: https://github.com/llvm-mirror/llvm/blob/2c4ca68/lib/Support/APInt.cpp#L1289-L1451 +# - ctbignum: https://github.com/niekbouman/ctbignum/blob/v0.5/include/ctbignum/division.hpp +# - Modern Computer Arithmetic - https://members.loria.fr/PZimmermann/mca/mca-cup-0.5.9.pdf +# p14 - 1.4.1 Naive Division +# - Handbook of Applied Cryptography - https://cacr.uwaterloo.ca/hac/about/chap14.pdf +# Chapter 14 algorithm 14.2.5 + +# Smith Method (and derivatives) +# This method improves Knuth algorithm by ~3x by removing regular normalization +# - A Multiple-Precision Division Algorithm, David M Smith +# American mathematical Society, 1996 +# https://www.ams.org/journals/mcom/1996-65-213/S0025-5718-96-00688-6/S0025-5718-96-00688-6.pdf +# +# - An Efficient Multiple-Precision Division Algorithm, +# Liusheng Huang, Hong Zhong, Hong Shen, Yonglong Luo, 2005 +# https://ieeexplore.ieee.org/document/1579076 +# +# - Efficient multiple-precision integer division algorithm +# Debapriyay Mukhopadhyaya, Subhas C.Nandy, 2014 +# 
https://www.sciencedirect.com/science/article/abs/pii/S0020019013002627 + +# Recursive division by Burnikel and Ziegler (http://www.mpi-sb.mpg.de/~ziegler/TechRep.ps.gz): +# - Python implementation: https://bugs.python.org/file11060/fast_div.py and discussion https://bugs.python.org/issue3451 +# - C++ implementation: https://github.com/linbox-team/givaro/blob/master/src/kernel/recint/rudiv.h +# - The Handbook of Elliptic and Hyperelliptic Cryptography Algorithm 10.35 on page 188 has a more explicit version of the div2NxN algorithm. This algorithm is directly recursive and avoids the mutual recursion of the original paper's calls between div2NxN and div3Nx2N. +# - Modern Computer Arithmetic - https://members.loria.fr/PZimmermann/mca/mca-cup-0.5.9.pdf +# p18 - 1.4.3 Divide and Conquer Division + +# Newton Raphson Iterations +# - Putty (constant-time): https://github.com/github/putty/blob/0.74/mpint.c#L1818-L2112 +# - Modern Computer Arithmetic - https://members.loria.fr/PZimmermann/mca/mca-cup-0.5.9.pdf +# p18 - 1.4.3 Divide and Conquer Division + +# Other libraries that can be used as reference for alternative (?) implementations: +# - TTMath: https://github.com/status-im/nim-ttmath/blob/8f6ff2e57b65a350479c4012a53699e262b19975/src/headers/ttmathuint.h#L1530-L2383 +# - LibTomMath: https://github.com/libtom/libtommath +# - Google Abseil for uint128: https://github.com/abseil/abseil-cpp/tree/master/absl/numeric +# Note: GMP/MPFR are GPL. The papers can be used but not their code. 
+ +# Related research +# - Efficient divide-and-conquer multiprecision integer division +# William Hart, IEEE 2015 +# https://github.com/wbhart/bsdnt +# https://ieeexplore.ieee.org/document/7203801 \ No newline at end of file diff --git a/stint/uintops.nim b/stint/uintops.nim index e89987c..738a1b9 100644 --- a/stint/uintops.nim +++ b/stint/uintops.nim @@ -13,6 +13,8 @@ import ./private/uint_bitwise, ./private/uint_shift, ./private/uint_addsub, + ./private/uint_mul, + ./private/uint_div, ./private/primitives/addcarry_subborrow export StUint @@ -171,7 +173,6 @@ export `+=` # - It's implemented at the limb-level so that # in the future Stuint[254] and Stuint256] share a common codepath -import ./private/uint_mul {.push raises: [], inline, noInit, gcsafe.} func `*`*(a, b: Stuint): Stuint = @@ -227,3 +228,5 @@ func pow*[aBits, eBits](a: Stuint[aBits], e: Stuint[eBits]): Stuint[aBits] = # Division & Modulo # -------------------------------------------------------- + +export uint_div \ No newline at end of file From 53d2fd14f3cd34d9b92bb9593d4ddf7396bc422a Mon Sep 17 00:00:00 2001 From: Mamy Ratsimbazafy Date: Sat, 22 Jan 2022 01:42:54 +0100 Subject: [PATCH 17/26] uint division - compile and pass the single limb tests --- stint/private/datatypes.nim | 6 +- .../primitives/compiletime_fallback.nim | 49 ++++++++++++- .../private/primitives/extended_precision.nim | 69 ++++++++++++++----- .../extended_precision_64bit_uint128.nim | 8 +-- .../extended_precision_x86_64_gcc.nim | 2 +- .../extended_precision_x86_64_msvc.nim | 16 +---- stint/private/uint_div.nim | 49 ++++++++++--- tests/test_uint_divmod.nim | 28 ++++---- 8 files changed, 163 insertions(+), 64 deletions(-) diff --git a/stint/private/datatypes.nim b/stint/private/datatypes.nim index 0e87408..39947fa 100644 --- a/stint/private/datatypes.nim +++ b/stint/private/datatypes.nim @@ -180,9 +180,9 @@ macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped # Copy # 
-------------------------------------------------------- -func copyFrom*[dLen, sLen]( - dst: var SomeBigInteger[dLen], - src: SomeBigInteger[sLen] +func copyFrom*( + dst: var SomeBigInteger, + src: SomeBigInteger ){.inline.} = ## Copy a BigInteger, truncated to 2^slen if the source ## is larger than the destination diff --git a/stint/private/primitives/compiletime_fallback.nim b/stint/private/primitives/compiletime_fallback.nim index 92580d9..051cf86 100644 --- a/stint/private/primitives/compiletime_fallback.nim +++ b/stint/private/primitives/compiletime_fallback.nim @@ -80,7 +80,7 @@ func mul_nim*(hi, lo: var uint64, u, v: uint64) = hi = x3 + hi(x1) lo = merge(x1, lo(x0)) -func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} = +func muladd1_nim*(hi, lo: var uint64, a, b, c: uint64) {.inline.} = ## Extended precision multiplication + addition ## (hi, lo) <- a*b + c ## @@ -91,7 +91,7 @@ func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} = addC_nim(carry, lo, lo, c, 0) addC_nim(carry, hi, hi, 0, carry) -func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}= +func muladd2_nim*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}= ## Extended precision multiplication + addition + addition ## (hi, lo) <- a*b + c1 + c2 ## @@ -107,3 +107,48 @@ func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}= # Carry chain 2 addC_nim(carry2, lo, lo, c2, 0) addC_nim(carry2, hi, hi, 0, carry2) + + +func div2n1n_nim*[T: SomeunsignedInt](q, r: var T, n_hi, n_lo, d: T) = + ## Division uint128 by uint64 + ## Warning ⚠️ : + ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE + ## - if n_hi > d result is undefined + + # doAssert leadingZeros(d) == 0, "Divisor was not normalized" + + const + size = sizeof(q) * 8 + halfSize = size div 2 + halfMask = (1.T shl halfSize) - 1.T + + template halfQR(n_hi, n_lo, d, d_hi, d_lo: T): tuple[q,r: T] = + + var (q, r) = (n_hi div d_hi, n_hi mod d_hi) + let m = q * d_lo + r = (r shl 
halfSize) or n_lo + + # Fix the reminder, we're at most 2 iterations off + if r < m: + dec q + r += d + if r >= d and r < m: + dec q + r += d + r -= m + (q, r) + + let + d_hi = d shr halfSize + d_lo = d and halfMask + n_lohi = nlo shr halfSize + n_lolo = nlo and halfMask + + # First half of the quotient + let (q1, r1) = halfQR(n_hi, n_lohi, d, d_hi, d_lo) + + # Second half + let (q2, r2) = halfQR(r1, n_lolo, d, d_hi, d_lo) + + q = (q1 shl halfSize) or q2 + r = r2 \ No newline at end of file diff --git a/stint/private/primitives/extended_precision.nim b/stint/private/primitives/extended_precision.nim index b666786..9d795fd 100644 --- a/stint/private/primitives/extended_precision.nim +++ b/stint/private/primitives/extended_precision.nim @@ -73,19 +73,57 @@ func muladd2*(hi, lo: var uint32, a, b, c1, c2: uint32) {.inline.}= # ############################################################ when sizeof(int) == 8 and not defined(Stint32): - when nimvm: - from ./compiletime_fallback import mul_nim, muladd1, muladd2 - else: - when defined(vcc): - from ./extended_precision_x86_64_msvc import div2n1n, mul, muladd1, muladd2 - elif GCCCompatible: - when X86: - from ./extended_precision_x86_64_gcc import div2n1n - from ./extended_precision_64bit_uint128 import mul, muladd1, muladd2 - else: - from ./extended_precision_64bit_uint128 import div2n1n, mul, muladd1, muladd2 - export div2n1n, mul - export muladd1, muladd2 + from ./compiletime_fallback import div2n1n_nim, mul_nim, muladd1_nim, muladd2_nim + + when defined(vcc): + from ./extended_precision_x86_64_msvc import div2n1n_128, mul_128, muladd1_128, muladd2_128 + elif GCCCompatible: + when X86: + from ./extended_precision_x86_64_gcc import div2n1n_128 + from ./extended_precision_64bit_uint128 import mul_128, muladd1_128, muladd2_128 + else: + from ./extended_precision_64bit_uint128 import div2n1n_128, mul_128, muladd1_128, muladd2_128 + + func mul*(hi, lo: var uint64, u, v: uint64) {.inline.}= + ## Extended precision 
multiplication + ## (hi, lo) <- u * v + when nimvm: + mul_nim(hi, lo, u, v) + else: + mul_128(hi, lo, u, v) + + func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.}= + ## Extended precision multiplication + addition + ## (hi, lo) <- a*b + c + ## + ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) + ## so adding any c cannot overflow + when nimvm: + muladd1_nim(hi, lo, a, b, c) + else: + muladd1_128(hi, lo, a, b, c) + + func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}= + ## Extended precision multiplication + addition + addition + ## (hi, lo) <- a*b + c1 + c2 + ## + ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) + ## so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000) + ## and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing + when nimvm: + muladd2_nim(hi, lo, a, b, c1, c2) + else: + muladd2_128(hi, lo, a, b, c1, c2) + + func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}= + ## Division uint128 by uint64 + ## Warning ⚠️ : + ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE + ## - if n_hi > d result is undefined + when nimvm: + div2n1n_nim(q, r, n_hi, n_lo, d) + else: + div2n1n_128(q, r, n_hi, n_lo, d) # ############################################################ # @@ -128,10 +166,7 @@ func mulAcc*[T: uint32|uint64](t, u, v: var T, a, b: T) {.inline.} = ## (t, u, v) <- (t, u, v) + a * b var UV: array[2, T] var carry: Carry - when nimvm: - mul_nim(UV[1], UV[0], a, b) - else: - mul(UV[1], UV[0], a, b) + mul(UV[1], UV[0], a, b) addC(carry, v, v, UV[0], Carry(0)) addC(carry, u, u, UV[1], carry) t += T(carry) diff --git a/stint/private/primitives/extended_precision_64bit_uint128.nim b/stint/private/primitives/extended_precision_64bit_uint128.nim index 7861427..321e4da 100644 --- a/stint/private/primitives/extended_precision_64bit_uint128.nim +++ 
b/stint/private/primitives/extended_precision_64bit_uint128.nim @@ -19,7 +19,7 @@ static: doAssert GCC_Compatible doAssert sizeof(int) == 8 -func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}= +func div2n1n_128*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}= ## Division uint128 by uint64 ## Warning ⚠️ : ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE on some platforms @@ -35,7 +35,7 @@ func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}= {.emit:["*",q, " = (NU64)(", dblPrec," / ", d, ");"].} {.emit:["*",r, " = (NU64)(", dblPrec," % ", d, ");"].} -func mul*(hi, lo: var uint64, a, b: uint64) {.inline.} = +func mul_128*(hi, lo: var uint64, a, b: uint64) {.inline.} = ## Extended precision multiplication ## (hi, lo) <- a*b block: @@ -50,7 +50,7 @@ func mul*(hi, lo: var uint64, a, b: uint64) {.inline.} = {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} {.emit:["*",lo, " = (NU64)", dblPrec,";"].} -func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} = +func muladd1_128*(hi, lo: var uint64, a, b, c: uint64) {.inline.} = ## Extended precision multiplication + addition ## (hi, lo) <- a*b + c ## @@ -71,7 +71,7 @@ func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.} = {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} {.emit:["*",lo, " = (NU64)", dblPrec,";"].} -func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}= +func muladd2_128*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}= ## Extended precision multiplication + addition + addition ## This is constant-time on most hardware except some specific one like Cortex M0 ## (hi, lo) <- a*b + c1 + c2 diff --git a/stint/private/primitives/extended_precision_x86_64_gcc.nim b/stint/private/primitives/extended_precision_x86_64_gcc.nim index 0e18c7f..9c7b7ab 100644 --- a/stint/private/primitives/extended_precision_x86_64_gcc.nim +++ b/stint/private/primitives/extended_precision_x86_64_gcc.nim @@ -20,7 +20,7 @@ 
static: doAssert sizeof(int) == 8 doAssert X86 -func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}= +func div2n1n_128*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}= ## Division uint128 by uint64 ## Warning ⚠️ : ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE diff --git a/stint/private/primitives/extended_precision_x86_64_msvc.nim b/stint/private/primitives/extended_precision_x86_64_msvc.nim index 9adcd32..3a66457 100644 --- a/stint/private/primitives/extended_precision_x86_64_msvc.nim +++ b/stint/private/primitives/extended_precision_x86_64_msvc.nim @@ -38,35 +38,25 @@ func div2n1n*(q, r: var Ct[uint64], n_hi, n_lo, d: Ct[uint64]) {.inline.}= ## Warning ⚠️ : ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE ## - if n_hi > d result is undefined - {.warning: "unsafeDiv2n1n is not constant-time at the moment on most hardware".} - - # TODO !!! - Replace by constant-time, portable, non-assembly version - # -> use uint128? 
Compiler might add unwanted branches q = udiv128(n_hi, n_lo, d, r) -func mul*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} = +func mul_128*(hi, lo: var Ct[uint64], a, b: Ct[uint64]) {.inline.} = ## Extended precision multiplication ## (hi, lo) <- a*b - ## - ## This is constant-time on most hardware - ## See: https://www.bearssl.org/ctmul.html lo = umul128(a, b, hi) -func muladd1*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.} = +func muladd1_128*(hi, lo: var Ct[uint64], a, b, c: Ct[uint64]) {.inline.} = ## Extended precision multiplication + addition ## (hi, lo) <- a*b + c ## ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) ## so adding any c cannot overflow - ## - ## This is constant-time on most hardware - ## See: https://www.bearssl.org/ctmul.html var carry: Carry lo = umul128(a, b, hi) addC(carry, lo, lo, c, Carry(0)) addC(carry, hi, hi, 0, carry) -func muladd2*(hi, lo: var Ct[uint64], a, b, c1, c2: Ct[uint64]) {.inline.}= +func muladd2_128*(hi, lo: var Ct[uint64], a, b, c1, c2: Ct[uint64]) {.inline.}= ## Extended precision multiplication + addition + addition ## This is constant-time on most hardware except some specific one like Cortex M0 ## (hi, lo) <- a*b + c1 + c2 diff --git a/stint/private/uint_div.nim b/stint/private/uint_div.nim index e3bfaed..aaa8606 100644 --- a/stint/private/uint_div.nim +++ b/stint/private/uint_div.nim @@ -63,11 +63,11 @@ func shortDiv*(a: var Limbs, k: Word): Word = # d = d shr 1 # dec(shift) -func knuthDivLE[qLen, rLen, uLen, vLen: static int]( - q: var Limbs[qLen], - r: var Limbs[rLen], - u: Limbs[uLen], - v: Limbs[vLen], +func knuthDivLE( + q: var StUint, + r: var StUint, + u: StUint, + v: StUint, needRemainder: bool) = ## Compute the quotient and remainder (if needed) ## of the division of u by v @@ -80,6 +80,15 @@ func knuthDivLE[qLen, rLen, uLen, vLen: static int]( # # Resources at the bottom of the file + const + qLen = q.limbs.len + rLen = r.limbs.len + uLen = u.limbs.len + 
vLen = v.limbs.len + + template `[]`(a: Stuint, i: int): Word = a.limbs[i] + template `[]=`(a: Stuint, i: int, val: Word) = a.limbs[i] = val + # Find the most significant word with actual set bits # and get the leading zero count there var divisorLen = vLen @@ -96,7 +105,7 @@ func knuthDivLE[qLen, rLen, uLen, vLen: static int]( # Divisor is a single word. if divisorLen == 1: q.copyFrom(u) - r.leastSignificantWord() = q.shortDiv(v.leastSignificantWord()) + r.leastSignificantWord() = q.limbs.shortDiv(v.leastSignificantWord()) # zero all but the least significant word var lsw = true for w in leastToMostSig(r): @@ -111,8 +120,8 @@ func knuthDivLE[qLen, rLen, uLen, vLen: static int]( # Normalize so that the divisor MSB is set, # vn cannot overflow, un can overflowed by 1 word at most, hence uLen+1 - un.shlSmallOverflowing(u, clz) - vn.shlSmall(v, clz) + un.shlSmallOverflowing(u.limbs, clz) + vn.shlSmall(v.limbs, clz) static: doAssert cpuEndian == littleEndian, "Currently the division algorithm requires little endian ordering of the limbs" # TODO: is it worth it to have the uint be the exact same extended precision representation @@ -161,24 +170,42 @@ func knuthDivLE[qLen, rLen, uLen, vLen: static int]( q[j] -= 1 var carry = Carry(0) for i in 0 ..< divisorLen: - addC(carry, u[j+i], u[j+i], v[i], carry) + addC(carry, un[j+i], un[j+i], v[i], carry) # Quotient is found, if remainder is needed we need to un-normalize un if needRemainder: - r.shrSmall(un, clz) + # r.limbs.shrSmall(un, clz) - TODO + when cpuEndian == littleEndian: + # rLen+1 == un.len + for i in 0 ..< rLen: + r[i] = (un[i] shr clz) or (un[i+1] shl (WordBitWidth - clz)) + else: + {.error: "Not Implemented for bigEndian".} + const BinaryShiftThreshold = 8 # If the difference in bit-length is below 8 # binary shift is probably faster func divmod(q, r: var Stuint, +<<<<<<< HEAD x: Limbs[xLen], y: Limbs[yLen], needRemainder: bool) = +======= + x, y: Stuint, needRemainder: bool) = + +>>>>>>> 88858a7 (uint division - 
compile and pass the single limb tests) let x_clz = x.leadingZeros() let y_clz = y.leadingZeros() # We short-circuit division depending on special-cases. +<<<<<<< HEAD if unlikely(y.isZero): raise newException(DivByZeroDefect, "You attempted to divide by zero") elif y_clz == (bitsof(y) - 1): +======= + if unlikely(y.isZero()): + raise newException(DivByZeroError, "You attempted to divide by zero") + elif y_clz == (y.bits - 1): +>>>>>>> 88858a7 (uint division - compile and pass the single limb tests) # y is one q = x # elif (x.hi or y.hi).isZero: @@ -209,7 +236,7 @@ func `div`*(x, y: Stuint): Stuint {.inline.} = func `mod`*(x, y: Stuint): Stuint {.inline.} = ## Remainder operation for multi-precision unsigned uint var tmp{.noInit.}: Stuint - divmod(tmp, result, x,y, needRemainder = true) + divmod(tmp, result, x, y, needRemainder = true) func divmod*(x, y: Stuint): tuple[quot, rem: Stuint] = ## Division and remainder operations for multi-precision unsigned uint diff --git a/tests/test_uint_divmod.nim b/tests/test_uint_divmod.nim index b210996..0cb5002 100644 --- a/tests/test_uint_divmod.nim +++ b/tests/test_uint_divmod.nim @@ -190,19 +190,21 @@ suite "Testing unsigned int division and modulo implementation": check: cast[uint64](qr.quot) == 7'u64 check: cast[uint64](qr.rem) == 9'u64 - test "Divmod(2^64, 3) returns the correct result": - let a = 1.stuint(128) shl 64 - let b = 3.stuint(128) - - let qr = divmod(a, b) - - let q = cast[UintImpl[uint64]](qr.quot) - let r = cast[UintImpl[uint64]](qr.rem) - - check: q.lo == 6148914691236517205'u64 - check: q.hi == 0'u64 - check: r.lo == 1'u64 - check: r.hi == 0'u64 + # TODO - no more .lo / .hi + # + # test "Divmod(2^64, 3) returns the correct result": + # let a = 1.stuint(128) shl 64 + # let b = 3.stuint(128) + # + # let qr = divmod(a, b) + # + # let q = cast[UintImpl[uint64]](qr.quot) + # let r = cast[UintImpl[uint64]](qr.rem) + # + # check: q.lo == 6148914691236517205'u64 + # check: q.hi == 0'u64 + # check: r.lo == 1'u64 + 
# check: r.hi == 0'u64 test "Divmod(1234567891234567890, 10) returns the correct result": let a = cast[StUint[64]](1234567891234567890'u64) From 7efa2483e4083f6357e3ce3f878e10da49695042 Mon Sep 17 00:00:00 2001 From: Mamy Ratsimbazafy Date: Sun, 23 Jan 2022 21:39:26 +0100 Subject: [PATCH 18/26] Division/modulo implemented - pass property-based testing vs ttmath --- benchmarks/bench_mod.nim | 18 +-- stint/private/uint_addsub.nim | 2 - stint/private/uint_div.nim | 228 ++++++++++++++++++++++++++-------- stint/private/uint_shift.nim | 41 +++--- stint/uintops.nim | 2 +- 5 files changed, 204 insertions(+), 87 deletions(-) diff --git a/benchmarks/bench_mod.nim b/benchmarks/bench_mod.nim index 0f79b68..57fabb7 100644 --- a/benchmarks/bench_mod.nim +++ b/benchmarks/bench_mod.nim @@ -15,29 +15,33 @@ echo "Warmup: " & $(stop - start) & "s" #################################### +let a = [123'u64, 123'u64, 123'u64, 123'u64] +let m = [456'u64, 456'u64, 456'u64, 45'u64] + +let aU256 = cast[Stuint[256]](a) +let mU256 = cast[Stuint[256]](m) start = cpuTime() block: - var foo = 123.u256 + var foo = aU256 for i in 0 ..< 10_000_000: - foo += i.u256 * i.u256 mod 456.u256 - foo = foo mod 789.u256 + foo += (foo * foo) mod mU256 stop = cpuTime() echo "Library: " & $(stop - start) & "s" when defined(bench_ttmath): # need C++ - import ttmath + import ttmath, ../tests/ttmath_compat template tt_u256(a: int): UInt[256] = ttmath.u256(a.uint) start = cpuTime() block: - var foo = 123.tt_u256 + var foo = a.astt() + let mU256 = m.astt() for i in 0 ..< 10_000_000: - foo += i.tt_u256 * i.tt_u256 mod 456.tt_u256 - foo = foo mod 789.tt_u256 + foo += (foo * foo) mod mU256 stop = cpuTime() echo "TTMath: " & $(stop - start) & "s" diff --git a/stint/private/uint_addsub.nim b/stint/private/uint_addsub.nim index f037c1b..6821d75 100644 --- a/stint/private/uint_addsub.nim +++ b/stint/private/uint_addsub.nim @@ -8,8 +8,6 @@ # at your option. 
This file may not be copied, modified, or distributed except according to those terms. import - # Status lib - stew/bitops2, # Internal ./datatypes, ./primitives/addcarry_subborrow diff --git a/stint/private/uint_div.nim b/stint/private/uint_div.nim index aaa8606..333234c 100644 --- a/stint/private/uint_div.nim +++ b/stint/private/uint_div.nim @@ -13,9 +13,33 @@ import # Internal ./datatypes, ./uint_bitwise, - ./uint_shift, ./primitives/[addcarry_subborrow, extended_precision] +# Helpers +# -------------------------------------------------------- + +func usedBitsAndWords(a: openArray[Word]): tuple[bits, words: int] {.inline.} = + ## Returns the number of used words and bits in a bigInt + var clz = 0 + # Count Leading Zeros + for i in countdown(a.len-1, 0): + let count = log2trunc(a[i]) + # debugEcho "count: ", count, ", a[", i, "]: ", a[i].toBin(64) + if count == -1: + clz += WordBitWidth + else: + clz += WordBitWidth - count - 1 + return (a.len*WordBitWidth - clz, i+1) + +func copyWords( + a: var openArray[Word], startA: int, + b: openArray[Word], startB: int, + numWords: int) = + ## Copy a slice of B into A. 
This properly deals + ## with overlaps when A and B are slices of the same buffer + for i in countdown(numWords-1, 0): + a[startA+i] = b[startB+i] + # Division # -------------------------------------------------------- @@ -37,45 +61,18 @@ func shortDiv*(a: var Limbs, k: Word): Word = # Undo normalization result = result shr clz -# func binaryShiftDiv[qLen, rLen, uLen, vLen: static int]( -# q: var Limbs[qLen], -# r: var Limbs[rLen], -# u: Limbs[uLen], -# v: Limbs[vLen]) = -# ## Division for multi-precision unsigned uint -# ## Implementation through binary shift division -# doAssert y.isZero.not() # This should be checked on release mode in the divmod caller proc - -# type SubTy = type x.lo - -# var -# shift = y.leadingZeros - x.leadingZeros -# d = y shl shift - -# r = x - -# while shift >= 0: -# q += q -# if r >= d: -# r -= d -# q.lo = q.lo or one(SubTy) - -# d = d shr 1 -# dec(shift) - -func knuthDivLE( - q: var StUint, - r: var StUint, - u: StUint, - v: StUint, - needRemainder: bool) = - ## Compute the quotient and remainder (if needed) - ## of the division of u by v +func shlAddMod_multi(a: var openArray[Word], c: Word, + M: openArray[Word], mBits: int): Word = + ## Fused modular left-shift + add + ## Shift input `a` by a word and add `c` modulo `M` + ## + ## Specialized for M being a multi-precision integer. 
## - ## - q must be of size uLen - vLen + 1 (assuming u and v uses all words) - ## - r must be of size vLen (assuming v uses all words) - ## - uLen >= vLen + ## With a word W = 2^WordBitWidth and a modulus M + ## Does a <- a * W + c (mod M) + ## and returns q = (a * W + c ) / M ## +<<<<<<< HEAD ## For now only LittleEndian is implemented # # Resources at the bottom of the file @@ -187,25 +184,15 @@ const BinaryShiftThreshold = 8 # If the difference in bit-length is below 8 # binary shift is probably faster func divmod(q, r: var Stuint, -<<<<<<< HEAD - x: Limbs[xLen], y: Limbs[yLen], needRemainder: bool) = -======= x, y: Stuint, needRemainder: bool) = ->>>>>>> 88858a7 (uint division - compile and pass the single limb tests) let x_clz = x.leadingZeros() let y_clz = y.leadingZeros() # We short-circuit division depending on special-cases. -<<<<<<< HEAD - if unlikely(y.isZero): - raise newException(DivByZeroDefect, "You attempted to divide by zero") - elif y_clz == (bitsof(y) - 1): -======= if unlikely(y.isZero()): raise newException(DivByZeroError, "You attempted to divide by zero") elif y_clz == (y.bits - 1): ->>>>>>> 88858a7 (uint division - compile and pass the single limb tests) # y is one q = x # elif (x.hi or y.hi).isZero: @@ -225,28 +212,163 @@ func divmod(q, r: var Stuint, r = x # elif (y_clz - x_clz) < BinaryShiftThreshold: # binaryShiftDiv(x, y, result.quot, result.rem) + ## The modulus `M` most-significant bit at `mBits` MUST be set. + + # Assuming 64-bit words + let hi = a[^1] # Save the high word to detect carries + let R = mBits and (WordBitWidth - 1) # R = mBits mod 64 + + var a0, a1, m0: Word + if R == 0: # If the number of mBits is a multiple of 64 + a0 = a[^1] # + copyWords(a, 1, a, 0, a.len-1) # we can just shift words + a[0] = c # and replace the first one by c + a1 = a[^1] + m0 = M[^1] + else: # Else: need to deal with partial word shifts at the edge. 
+ let clz = WordBitWidth-R + a0 = (a[^1] shl clz) or (a[^2] shr R) + copyWords(a, 1, a, 0, a.len-1) + a[0] = c + a1 = (a[^1] shl clz) or (a[^2] shr R) + m0 = (M[^1] shl clz) or (M[^2] shr R) + + # m0 has its high bit set. (a0, a1)/m0 fits in a limb. + # Get a quotient q, at most we will be 2 iterations off + # from the true quotient + var q: Word # Estimate quotient + if a0 == m0: # if a_hi == divisor + q = high(Word) # quotient = MaxWord (0b1111...1111) + elif a0 == 0 and a1 < m0: # elif q == 0, true quotient = 0 + q = 0 + else: + var r: Word + div2n1n(q, r, a0, a1, m0) # else instead of being of by 0, 1 or 2 + q -= 1 # we return q-1 to be off by -1, 0 or 1 + + # Now substract a*2^64 - q*m + var carry = Word(0) + var overM = true # Track if quotient greater than the modulus + + for i in 0 ..< M.len: + var qm_lo: Word + block: # q*m + # q * p + carry (doubleword) carry from previous limb + muladd1(carry, qm_lo, q, M[i], carry) + + block: # a*2^64 - q*m + var borrow: Borrow + subB(borrow, a[i], a[i], qm_lo, Borrow(0)) + carry += Word(borrow) # Adjust if borrow + + if a[i] != M[i]: + overM = a[i] > M[i] + + # Fix quotient, the true quotient is either q-1, q or q+1 + # + # if carry < q or carry == q and overM we must do "a -= M" + # if carry > hi (negative result) we must do "a += M" + if carry > hi: + var c = Carry(0) + for i in 0 ..< a.len: + addC(c, a[i], a[i], M[i], c) + q -= 1 + elif overM or (carry < hi): + var b = Borrow(0) + for i in 0 ..< a.len: + subB(b, a[i], a[i], M[i], b) + q += 1 + + return q + +func shlAddMod(a: var openArray[Word], c: Word, + M: openArray[Word], mBits: int): Word {.inline.}= + ## Fused modular left-shift + add + ## Shift input `a` by a word and add `c` modulo `M` + ## + ## With a word W = 2^WordBitWidth and a modulus M + ## Does a <- a * W + c (mod M) + ## and returns q = (a * W + c ) / M + ## + ## The modulus `M` most-significant bit at `mBits` MUST be set. 
+ if mBits <= WordBitWidth: + # If M fits in a single limb + + # We normalize M with clz so that the MSB is set + # And normalize (a * 2^64 + c) by R as well to maintain the result + # This ensures that (a0, a1)/p0 fits in a limb. + let R = mBits and (WordBitWidth - 1) + let clz = WordBitWidth-R + + # (hi, lo) = a * 2^64 + c + let hi = (a[0] shl clz) or (c shr R) + let lo = c shl clz + let m0 = M[0] shl clz + + var q, r: Word + div2n1n(q, r, hi, lo, m0) + a[0] = r shr clz + return q + else: + return shlAddMod_multi(a, c, M, mBits) + +func divRemImpl( + q, r: var openArray[Word], + a, b: openArray[Word] + ) = + let (aBits, aLen) = usedBitsAndWords(a) + let (bBits, bLen) = usedBitsAndWords(b) + let rLen = bLen + + if aBits < bBits: + # if a uses less bits than b, + # a < b, so q = 0 and r = a + copyWords(r, 0, a, 0, aLen) + for i in aLen ..< r.len: # r.len >= rLen + r[i] = 0 + for i in 0 ..< q.len: + q[i] = 0 else: - knuthDivLE(q, r, x, y, needRemainder) + # The length of a is at least the divisor + # We can copy bLen-1 words + # and modular shift-lef-add the rest + let aOffset = aLen - bLen + copyWords(r, 0, a, aOffset+1, bLen-1) + r[rLen-1] = 0 + # Now shift-left the copied words while adding the new word mod b + for i in countdown(aOffset, 0): + q[i] = shlAddMod( + r.toOpenArray(0, rLen-1), + a[i], + b.toOpenArray(0, bLen-1), + bBits + ) + + # Clean up extra words + for i in aOffset+1 ..< q.len: + q[i] = 0 + for i in rLen ..< r.len: + r[i] = 0 func `div`*(x, y: Stuint): Stuint {.inline.} = ## Division operation for multi-precision unsigned uint var tmp{.noInit.}: Stuint - divmod(result, tmp, x, y, needRemainder = false) + divRemImpl(result.limbs, tmp.limbs, x.limbs, y.limbs) func `mod`*(x, y: Stuint): Stuint {.inline.} = ## Remainder operation for multi-precision unsigned uint var tmp{.noInit.}: Stuint - divmod(tmp, result, x, y, needRemainder = true) + divRemImpl(tmp.limbs, result.limbs, x.limbs, y.limbs) func divmod*(x, y: Stuint): tuple[quot, rem: Stuint] = ## 
Division and remainder operations for multi-precision unsigned uint - divmod(result.quot, result.rem, x, y, needRemainder = true) + divRemImpl(result.quot.limbs, result.rem.limbs, x.limbs, y.limbs) # ###################################################################### # Division implementations # # Multi-precision division is a costly -#and also difficult to implement operation +# and also difficult to implement operation # ##### Research ##### diff --git a/stint/private/uint_shift.nim b/stint/private/uint_shift.nim index a1441b0..3d50abf 100644 --- a/stint/private/uint_shift.nim +++ b/stint/private/uint_shift.nim @@ -54,33 +54,14 @@ func shrWords*(r: var Limbs, a: Limbs, w: SomeInteger) = when cpuEndian == littleEndian: for i in 0 ..< Limbs.len-w: r[i] = a[i+w] + for i in Limbs.len-w ..< Limbs.len: + r[i] = 0 else: + for i in countdown(Limbs.len-1, Limbs.len-w): + r[i] = 0 for i in countdown(Limbs.len-w, 0): r[i] = a[i+w] -func shlSmallOverflowing*[rLen, aLen: static int]( - r: var Limbs[rLen], a: Limbs[aLen], k: SomeInteger) = - ## Compute the `shift left` operation of x and k - ## - ## k MUST be less than the base word size (2^32 or 2^64) - when cpuEndian == littleEndian: - r[0] = a[0] shl k - for i in 1 ..< a.len: - r[i] = (a[i] shl k) or (a[i-1] shr (WordBitWidth - k)) - if rLen > aLen: - r[aLen] = a[aLen - 1] shr (WordBitWidth - k) - for i in aLen+1 ..< rLen: - r[i] = 0 - else: - const offset = rLen - aLen - r[^1] = a[^1] shl k - for i in countdown(a.len-2, 0): - r[i+offset] = (a[i] shl k) or (a[i+1] shr (WordBitWidth - k)) - if rLen > aLen: - r[offset-1] = a[0] shr (WordBitWidth - k) - for i in 0 ..< offset-1: - r[i] = 0 - func shlSmall*(r: var Limbs, a: Limbs, k: SomeInteger) = ## Compute the `shift left` operation of x and k ## @@ -112,10 +93,14 @@ func shlLarge*(r: var Limbs, a: Limbs, w, shift: SomeInteger) = func shlWords*(r: var Limbs, a: Limbs, w: SomeInteger) = ## Shift left by w word when cpuEndian == littleEndian: + for i in 0 ..< w: + r[i] = 0 
for i in 0 ..< Limbs.len-w: r[i+w] = a[i] else: - for i in countdown(Limbs.len-1, 0): + for i in countdown(Limbs.len-1, Limbs.len-w): + r[i] = 0 + for i in countdown(Limbs.len-w-1, 0): r[i] = a[i-w] # Wrappers @@ -123,6 +108,10 @@ func shlWords*(r: var Limbs, a: Limbs, w: SomeInteger) = func shiftRight*(r: var Stuint, a: Stuint, k: SomeInteger) = ## Shift `a` right by k bits and store in `r` + if k == 0: + r = a + return + if k < WordBitWidth: r.limbs.shrSmall(a.limbs, k) return @@ -138,6 +127,10 @@ func shiftRight*(r: var Stuint, a: Stuint, k: SomeInteger) = func shiftLeft*(r: var Stuint, a: Stuint, k: SomeInteger) = ## Shift `a` left by k bits and store in `r` + if k == 0: + r = a + return + if k < WordBitWidth: r.limbs.shlSmall(a.limbs, k) r.clearExtraBits() diff --git a/stint/uintops.nim b/stint/uintops.nim index 738a1b9..f196bd2 100644 --- a/stint/uintops.nim +++ b/stint/uintops.nim @@ -26,7 +26,7 @@ export StUint func setZero*(a: var StUint) = ## Set ``a`` to 0 for i in 0 ..< a.limbs.len: - a[i] = 0 + a.limbs[i] = 0 func setSmallInt(a: var StUint, k: Word) = ## Set ``a`` to k From 4660dfe4a4bafe75751790dd46138efab3d311d3 Mon Sep 17 00:00:00 2001 From: Mamy Ratsimbazafy Date: Sun, 23 Jan 2022 22:45:47 +0100 Subject: [PATCH 19/26] Use littleEndian for limb-endianness: bigEndian arch are very rare, untestable in CI, a pain to maintain and an intermediate serialization step instead of casting is cheap --- README.md | 3 +- stint/endians2.nim | 25 ------ stint/io.nim | 95 ++------------------- stint/private/datatypes.nim | 149 ++++++++++----------------------- stint/private/uint_addsub.nim | 33 +++----- stint/private/uint_bitwise.nim | 32 +++---- stint/private/uint_div.nim | 41 +-------- stint/private/uint_shift.nim | 76 +++++------------ stint/uintops.nim | 51 ++++++----- 9 files changed, 134 insertions(+), 371 deletions(-) diff --git a/README.md b/README.md index 0a5906c..76b4191 100644 --- a/README.md +++ b/README.md @@ -20,8 +20,7 @@ Main focus: - Uint2048 for 
Ethereum Bloom filters - Ease of use: - Use traditional `+`, `-`, `+=`, etc operators like on native types - - Representation of numbers in memory is the exact same as native types and endianness aware. - - In practice that means that interfacing with binary blobs representing numbers from cryptographic libraries can be done with a `cast` if it represents a Uint256, Uint512, Uint1024, Uint2048. + - converting to and from raw byte BigInts (also called octet string in IETF specs) - converting to and from Hex - converting to and from decimal strings diff --git a/stint/endians2.nim b/stint/endians2.nim index 20d78bc..ac8796c 100644 --- a/stint/endians2.nim +++ b/stint/endians2.nim @@ -245,28 +245,3 @@ func fromBytes*[bits: static int]( result = fromBytesLE(T, x) else: result = fromBytesBE(T, x) - -# TODO: What is the use-case for all the procs below? -# ------------------------------------------------------------------------------------------ - -func toBE*[bits: static int](x: StUint[bits]): StUint[bits] {.inline, deprecated: "Use toByteArrayBE instead".} = - ## Convert a native endian value to big endian. Consider toBytesBE instead - ## which may prevent some confusion. - if cpuEndian == bigEndian: x - else: x.swapBytes - -func fromBE*[bits: static int](x: StUint[bits]): StUint[bits] {.inline, deprecated: "Use fromBytesBE instead".} = - ## Read a big endian value and return the corresponding native endian - # there's no difference between this and toBE, except when reading the code - toBE(x) - -func toLE*[bits: static int](x: StUint[bits]): StUint[bits] {.inline, deprecated.} = - ## Convert a native endian value to little endian. Consider toBytesLE instead - ## which may prevent some confusion. 
- if cpuEndian == littleEndian: x - else: x.swapBytes - -func fromLE*[bits: static int](x: StUint[bits]): StUint[bits] {.inline, deprecated: "Use fromBytesLE instead".} = - ## Read a little endian value and return the corresponding native endian - # there's no difference between this and toLE, except when reading the code - toLE(x) diff --git a/stint/io.nim b/stint/io.nim index 8483a15..c80fc4f 100644 --- a/stint/io.nim +++ b/stint/io.nim @@ -33,22 +33,10 @@ template static_check_size(T: typedesc[SomeInteger], bits: static[int]) = func stuint*[T: SomeInteger](n: T, bits: static[int]): StUint[bits] {.inline.}= ## Converts an integer to an arbitrary precision integer. - when cpuEndian == littleEndian: - result.limbs[0] = Word(n) - when sizeof(n) > sizeof(Word): - result.limbs[1] = Word(n) shr WordBitWidth - else: - result.limbs[^1] = Word(n) - when sizeof(n) > sizeof(Word): - result.limbs[^2] = Word(n) shr WordBitWidth - -<<<<<<< HEAD -func to*(x: SomeInteger, T: typedesc[StInt]): T = - stint(x, result.bits) + result.limbs[0] = Word(n) + when sizeof(n) > sizeof(Word): + result.limbs[1] = Word(n) shr WordBitWidth -func to*(x: SomeUnsignedInt, T: typedesc[StUint]): T = - stuint(x, result.bits) -======= # func stint*[T: SomeInteger](n: T, bits: static[int]): StInt[bits] {.inline.}= # ## Converts an integer to an arbitrary precision signed integer. 
# @@ -88,8 +76,8 @@ func stuint*(a: StUint, bits: static[int]): StUint[bits] {.inline.} = ## unsigned int to unsigned int conversion ## smaller to bigger bits conversion will have the same value ## bigger to smaller bits conversion, the result is truncated - for wr, wa in leastToMostSig(result, a): - wr = wa + for i in 0 ..< result.len: + result[i] = a[i] # func stuint*(a: StInt, bits: static[int]): StUint[bits] {.inline.} = # ## signed int to unsigned int conversion @@ -377,82 +365,13 @@ func dumpHex*(a: Stint or StUint, order: static[Endianness] = bigEndian): string let bytes = a.toBytes(order) result = bytes.toHex() -proc initFromBytesBE*[bits: static[int]](val: var Stuint[bits], - ba: openarray[byte], - allowPadding: static[bool] = true) {.deprecated:"Use fromBytesBE instead".}= - ## Initializes a UInt[bits] value from a byte buffer storing a big-endian - ## representation of a number. - ## - ## If `allowPadding` is set to false, the input array must be exactly - ## (bits div 8) bytes long. Otherwise, it may be shorter and the remaining - ## bytes will be assumed to be zero. - - const N = bits div 8 - - when not allowPadding: - doAssert(ba.len == N) - else: - doAssert ba.len <= N - when system.cpuEndian == bigEndian: - let baseIdx = N - val.len - else: - let baseIdx = ba.len - 1 - - when nimvm: - when system.cpuEndian == bigEndian: - when allowPadding: - for i, b in ba: val.data.setByte(baseIdx + i, b) - else: - for i, b in ba: val.data.setByte(i, b) - else: - when allowPadding: - for i, b in ba: val.data.setByte(baseIdx - i, b) - else: - for i, b in ba: val.data.setByte(N-1 - i, b) - else: - {.pragma: restrict, codegenDecl: "$# __restrict $#".} - let r_ptr {.restrict.} = cast[ptr array[N, byte]](val.addr) - - when system.cpuEndian == bigEndian: - # TODO: due to https://github.com/status-im/nim-stint/issues/38 - # We can't cast a stack byte array to stuint with a convenient proc signature. 
- when allowPadding: - for i, b in ba: r_ptr[baseIdx + i] = b - else: - for i, b in ba: r_ptr[i] = b - else: - when allowPadding: - for i, b in ba: r_ptr[baseIdx - i] = b - else: - for i, b in ba: r_ptr[N-1 - i] = b - -func significantBytesBE*(val: openArray[byte]): int {.deprecated.}= - ## Returns the number of significant trailing bytes in a big endian - ## representation of a number. - # TODO: move that in https://github.com/status-im/nim-byteutils - for i in 0 ..< val.len: - if val[i] != 0: - return val.len - i - return 1 - -func fromBytesBE*(T: type Stuint, ba: openarray[byte], - allowPadding: static[bool] = true): T {.noInit, inline.} = - ## This function provides a convenience wrapper around `initFromBytesBE`. - when not allowPadding: - {.deprecated: "fromBytesBE without padding is deprecated".} - result.initFromBytesBE(ba, allowPadding) - else: - result = endians2.fromBytesBE(T, ba) - func readUintBE*[bits: static[int]](ba: openarray[byte]): Stuint[bits] {.noInit, inline.}= ## Convert a big-endian array of (bits div 8) Bytes to an UInt[bits] (in native host endianness) ## Input: ## - a big-endian openArray of size (bits div 8) at least ## Returns: ## - A unsigned integer of the same size with `bits` bits - ## - ## ⚠ If the openarray length is bigger than bits div 8, part converted is undefined behaviour. 
- result = endians2.fromBytesBE(Stuint[bits], ba) + result = (typeof result).fromBytesBE(ba) func toByteArrayBE*[bits: static[int]](n: StUint[bits]): array[bits div 8, byte] {.noInit, inline.}= ## Convert a uint[bits] to to a big-endian array of bits div 8 bytes @@ -460,7 +379,7 @@ func toByteArrayBE*[bits: static[int]](n: StUint[bits]): array[bits div 8, byte] ## - an unsigned integer ## Returns: ## - a big-endian array of the same size - result = n.toBytes(bigEndian) + result = n.toBytesBE() template hash*(num: StUint|StInt): Hash = # TODO: diff --git a/stint/private/datatypes.nim b/stint/private/datatypes.nim index 39947fa..1c43049 100644 --- a/stint/private/datatypes.nim +++ b/stint/private/datatypes.nim @@ -20,35 +20,22 @@ const WordBitWidth* = sizeof(Word) * 8 func wordsRequired*(bits: int): int {.compileTime.} = ## Compute the number of limbs required - ## from the **announced** bit length + ## for the **announced** bit length (bits + WordBitWidth - 1) div WordBitWidth type Limbs*[N: static int] = array[N, Word] ## Limbs type - ## Large proc like multiplication and division - ## should operate at the limb-level - ## to avoid duplicate codepaths - ## For example for Stuint[16] and Stuint[32] - ## or if allowed in the future - ## Stuint[254] and Stuint[256] StUint*[bits: static[int]] = object ## Stack-based integer ## Unsigned limbs*: array[bits.wordsRequired, Word] - # TODO: using the limbs type here - # can using StUint[8] of length 2, instead of 1 - # in test_uint_bitwise (in the VM) - # unless you put the following instantiation - # at the bottom of this file - # static: - # echo StUint[8]() - - StInt*[bits: static[int]] = object + # Limbs-Endianess is little-endian + + StInt*[bits: static[int]] {.borrow: `.`.} = distinct StUint[bits] ## Stack-based integer ## Signed - limbs*: array[bits.wordsRequired, Word] Carry* = uint8 # distinct range[0'u8 .. 1] Borrow* = uint8 # distinct range[0'u8 .. 
1] @@ -62,25 +49,12 @@ when sizeof(int) == 8 and GCC_Compatible: type uint128*{.importc: "unsigned __int128".} = object -# Accessors +# Bithacks # -------------------------------------------------------- -template leastSignificantWord*(num: SomeInteger): auto = - num - -template leastSignificantWord*(a: SomeBigInteger): auto = - when cpuEndian == littleEndian: - a.limbs[0] - else: - a.limbs[^1] +{.push raises: [], inline, noInit, gcsafe.} -template mostSignificantWord*(a: SomeBigInteger): auto = - when cpuEndian == littleEndian: - a.limbs[^1] - else: - a.limbs[0] - -template clearExtraBits*(a: var StUint) = +template clearExtraBitsOverMSB*(a: var StUint) = ## A Stuint is stored in an array of 32 of 64-bit word ## If we do bit manipulation at the word level, ## for example a 8-bit stuint stored in a 64-bit word @@ -88,64 +62,34 @@ template clearExtraBits*(a: var StUint) = when a.bits != a.limbs.len * WordBitWidth: const posExtraBits = a.bits - (a.limbs.len-1) * WordBitWidth const mask = (Word(1) shl posExtraBits) - 1 - mostSignificantWord(a) = mostSignificantWord(a) and mask + a[^1] = a[^1] and mask + +func usedBitsAndWords*(a: openArray[Word]): tuple[bits, words: int] = + ## Returns the number of used words and bits in a bigInt + var clz = 0 + # Count Leading Zeros + for i in countdown(a.len-1, 0): + let count = log2trunc(a[i]) + # debugEcho "count: ", count, ", a[", i, "]: ", a[i].toBin(64) + if count == -1: + clz += WordBitWidth + else: + clz += WordBitWidth - count - 1 + return (a.len*WordBitWidth - clz, i+1) -# Iterations +{.pop.} + +# Accessors # -------------------------------------------------------- -iterator leastToMostSig*(a: SomeBigInteger): Word = - ## Iterate from least to most significant word - when cpuEndian == littleEndian: - for i in 0 ..< a.limbs.len: - yield a.limbs[i] - else: - for i in countdown(a.limbs.len-1, 0): - yield a.limbs[i] - -iterator leastToMostSig*(a: var SomeBigInteger): var Word = - ## Iterate from least to most significant word 
- when cpuEndian == littleEndian: - for i in 0 ..< a.limbs.len: - yield a.limbs[i] - else: - for i in countdown(a.limbs.len-1, 0): - yield a.limbs[i] - -iterator leastToMostSig*(a, b: SomeBigInteger): (Word, Word) = - ## Iterate from least to most significant word - when cpuEndian == littleEndian: - for i in 0 ..< a.limbs.len: - yield (a.limbs[i], b.limbs[i]) - else: - for i in countdown(a.limbs.len-1, 0): - yield (a.limbs[i], b.limbs[i]) - -iterator leastToMostSig*[aBits, bBits](a: var SomeBigInteger[aBits], b: SomeBigInteger[bBits]): (var Word, Word) = - ## Iterate from least to most significant word - when cpuEndian == littleEndian: - for i in 0 ..< min(a.limbs.len, b.limbs.len): - yield (a.limbs[i], b.limbs[i]) - else: - for i in countdown(min(a.limbs.len, b.limbs.len)-1, 0): - yield (a.limbs[i], b.limbs[i]) - -iterator leastToMostSig*(c: var SomeBigInteger, a, b: SomeBigInteger): (var Word, Word, Word) = - ## Iterate from least to most significant word - when cpuEndian == littleEndian: - for i in 0 ..< a.limbs.len: - yield (c.limbs[i], a.limbs[i], b.limbs[i]) - else: - for i in countdown(a.limbs.len-1, 0): - yield (c.limbs[i], a.limbs[i], b.limbs[i]) - -iterator mostToLeastSig*(a: SomeBigInteger): Word = - ## Iterate from most to least significant word - when cpuEndian == bigEndian: - for i in 0 ..< a.limbs.len: - yield a.limbs[i] - else: - for i in countdown(a.limbs.len-1, 0): - yield a.limbs[i] +template `[]`*(a: SomeBigInteger, i: SomeInteger or BackwardsIndex): Word = + a.limbs[i] + +template `[]=`*(a: var SomeBigInteger, i: SomeInteger or BackwardsIndex, val: Word) = + a.limbs[i] = val + +# Iterations +# -------------------------------------------------------- import std/macros @@ -179,20 +123,15 @@ macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped # Copy # -------------------------------------------------------- - -func copyFrom*( - dst: var SomeBigInteger, - src: SomeBigInteger - ){.inline.} = - ## Copy a BigInteger, 
truncated to 2^slen if the source - ## is larger than the destination - when cpuEndian == littleEndian: - for i in 0 ..< min(dst.limbs.len, src.limbs.len): - dst.limbs[i] = src.limbs[i] - for i in src.limbs.len ..< dst.limbs.len: - dst.limbs[i] = 0 - else: - for i in countdown(dst.limbs.len-1, src.limbs.len): - dst.limbs[i] = 0 - for i in countdown(src.limbs.len-1, 0): - dst.limbs[i] = src.limbs[i] +{.push raises: [], inline, noInit, gcsafe.} + +func copyWords*( + a: var openArray[Word], startA: int, + b: openArray[Word], startB: int, + numWords: int) = + ## Copy a slice of B into A. This properly deals + ## with overlaps when A and B are slices of the same buffer + for i in countdown(numWords-1, 0): + a[startA+i] = b[startB+i] + +{.pop.} \ No newline at end of file diff --git a/stint/private/uint_addsub.nim b/stint/private/uint_addsub.nim index 6821d75..3cd4909 100644 --- a/stint/private/uint_addsub.nim +++ b/stint/private/uint_addsub.nim @@ -19,40 +19,31 @@ import func sum*(r: var Stuint, a, b: Stuint) = ## Addition for multi-precision unsigned int var carry = Carry(0) - for wr, wa, wb in leastToMostSig(r, a, b): - addC(carry, wr, wa, wb, carry) - r.clearExtraBits() + for i in 0 ..< r.limbs.len: + addC(carry, r[i], a[i], b[i], carry) + r.clearExtraBitsOverMSB() func `+=`*(a: var Stuint, b: Stuint) = ## In-place addition for multi-precision unsigned int - var carry = Carry(0) - for wa, wb in leastToMostSig(a, b): - addC(carry, wa, wa, wb, carry) - a.clearExtraBits() + a.sum(a, b) func diff*(r: var Stuint, a, b: Stuint) = ## Substraction for multi-precision unsigned int var borrow = Borrow(0) - for wr, wa, wb in leastToMostSig(r, a, b): - subB(borrow, wr, wa, wb, borrow) - r.clearExtraBits() + for i in 0 ..< r.limbs.len: + subB(borrow, r[i], a[i], b[i], borrow) + r.clearExtraBitsOverMSB() func `-=`*(a: var Stuint, b: Stuint) = ## In-place substraction for multi-precision unsigned int - var borrow = Borrow(0) - for wa, wb in leastToMostSig(a, b): - subB(borrow, wa, 
wa, wb, borrow) - a.clearExtraBits() + a.diff(a, b) func inc*(a: var Stuint, w: Word = 1) = var carry = Carry(0) - when cpuEndian == littleEndian: - addC(carry, a.limbs[0], a.limbs[0], w, carry) - for i in 1 ..< a.limbs.len: - addC(carry, a.limbs[i], a.limbs[i], 0, carry) - else: - {.error: "Not implemented.".} - a.clearExtraBits() + addC(carry, a.limbs[0], a.limbs[0], w, carry) + for i in 1 ..< a.limbs.len: + addC(carry, a.limbs[i], a.limbs[i], 0, carry) + a.clearExtraBitsOverMSB() func sum*(r: var Stuint, a: Stuint, b: SomeUnsignedInt) = ## Addition for multi-precision unsigned int diff --git a/stint/private/uint_bitwise.nim b/stint/private/uint_bitwise.nim index 587b7a4..a3ce42b 100644 --- a/stint/private/uint_bitwise.nim +++ b/stint/private/uint_bitwise.nim @@ -20,30 +20,30 @@ import func bitnot*(r: var StUint, a: Stuint) = ## Bitwise complement of unsigned integer a ## i.e. flips all bits of the input - for wr, wa in leastToMostSig(r, a): - wr = not wa - r.clearExtraBits() + for i in 0 ..< r.len: + r[i] = not a[i] + r.clearExtraBitsOverMSB() func bitor*(r: var Stuint, a, b: Stuint) = ## `Bitwise or` of numbers a and b - for wr, wa, wb in leastToMostSig(r, a, b): - wr = wa or wb + for i in 0 ..< r.limbs.len: + r[i] = a[i] or b[i] func bitand*(r: var Stuint, a, b: Stuint) = ## `Bitwise and` of numbers a and b - for wr, wa, wb in leastToMostSig(r, a, b): - wr = wa and wb + for i in 0 ..< r.limbs.len: + r[i] = a[i] and b[i] func bitxor*(r: var Stuint, a, b: Stuint) = ## `Bitwise xor` of numbers x and y - for wr, wa, wb in leastToMostSig(r, a, b): - wr = wa xor wb - r.clearExtraBits() + for i in 0 ..< r.limbs.len: + r[i] = a[i] xor b[i] + r.clearExtraBitsOverMSB() func countOnes*(a: Stuint): int = result = 0 - for wa in leastToMostSig(a): - result += countOnes(wa) + for i in 0 ..< a.limbs.len: + result += countOnes(a[i]) func parity*(a: Stuint): int = result = parity(a.limbs[0]) @@ -56,8 +56,8 @@ func leadingZeros*(a: Stuint): int = # Adjust when we use only part 
of the word size var extraBits = WordBitWidth * a.limbs.len - a.bits - for word in mostToLeastSig(a): - let zeroCount = word.leadingZeros() + for i in countdown(a.len-1, 0): + let zeroCount = a.limbs[i].leadingZeros() if extraBits > 0: result += zeroCount - min(extraBits, WordBitWidth) extraBits -= WordBitWidth @@ -68,8 +68,8 @@ func leadingZeros*(a: Stuint): int = func trailingZeros*(a: Stuint): int = result = 0 - for word in leastToMostSig(a): - let zeroCount = word.trailingZeros() + for i in 0 ..< a.limbs.len: + let zeroCount = a[i].trailingZeros() result += zeroCount if zeroCount != WordBitWidth: break diff --git a/stint/private/uint_div.nim b/stint/private/uint_div.nim index 333234c..bb7ca5d 100644 --- a/stint/private/uint_div.nim +++ b/stint/private/uint_div.nim @@ -15,31 +15,6 @@ import ./uint_bitwise, ./primitives/[addcarry_subborrow, extended_precision] -# Helpers -# -------------------------------------------------------- - -func usedBitsAndWords(a: openArray[Word]): tuple[bits, words: int] {.inline.} = - ## Returns the number of used words and bits in a bigInt - var clz = 0 - # Count Leading Zeros - for i in countdown(a.len-1, 0): - let count = log2trunc(a[i]) - # debugEcho "count: ", count, ", a[", i, "]: ", a[i].toBin(64) - if count == -1: - clz += WordBitWidth - else: - clz += WordBitWidth - count - 1 - return (a.len*WordBitWidth - clz, i+1) - -func copyWords( - a: var openArray[Word], startA: int, - b: openArray[Word], startB: int, - numWords: int) = - ## Copy a slice of B into A. 
This properly deals - ## with overlaps when A and B are slices of the same buffer - for i in countdown(numWords-1, 0): - a[startA+i] = b[startB+i] - # Division # -------------------------------------------------------- @@ -312,7 +287,7 @@ func shlAddMod(a: var openArray[Word], c: Word, else: return shlAddMod_multi(a, c, M, mBits) -func divRemImpl( +func divRem*( q, r: var openArray[Word], a, b: openArray[Word] ) = @@ -350,20 +325,6 @@ func divRemImpl( for i in rLen ..< r.len: r[i] = 0 -func `div`*(x, y: Stuint): Stuint {.inline.} = - ## Division operation for multi-precision unsigned uint - var tmp{.noInit.}: Stuint - divRemImpl(result.limbs, tmp.limbs, x.limbs, y.limbs) - -func `mod`*(x, y: Stuint): Stuint {.inline.} = - ## Remainder operation for multi-precision unsigned uint - var tmp{.noInit.}: Stuint - divRemImpl(tmp.limbs, result.limbs, x.limbs, y.limbs) - -func divmod*(x, y: Stuint): tuple[quot, rem: Stuint] = - ## Division and remainder operations for multi-precision unsigned uint - divRemImpl(result.quot.limbs, result.rem.limbs, x.limbs, y.limbs) - # ###################################################################### # Division implementations # diff --git a/stint/private/uint_shift.nim b/stint/private/uint_shift.nim index 3d50abf..a0181c2 100644 --- a/stint/private/uint_shift.nim +++ b/stint/private/uint_shift.nim @@ -25,14 +25,9 @@ func shrSmall*(r: var Limbs, a: Limbs, k: SomeInteger) = # instead of a[i-1] and a[i] # is probably easier to parallelize for the compiler # (antidependence WAR vs loop-carried dependence RAW) - when cpuEndian == littleEndian: - for i in 0 ..< a.len-1: - r[i] = (a[i] shr k) or (a[i+1] shl (WordBitWidth - k)) - r[^1] = a[^1] shr k - else: - for i in countdown(a.len-1, 1): - r[i] = (a[i] shr k) or (a[i-1] shl (WordBitWidth - k)) - r[0] = a[0] shr k + for i in 0 ..< a.len-1: + r[i] = (a[i] shr k) or (a[i+1] shl (WordBitWidth - k)) + r[^1] = a[^1] shr k func shrLarge*(r: var Limbs, a: Limbs, w, shift: SomeInteger) = ## Shift 
right by `w` words + `shift` bits @@ -40,40 +35,24 @@ func shrLarge*(r: var Limbs, a: Limbs, w, shift: SomeInteger) = if w > Limbs.len: return - when cpuEndian == littleEndian: - for i in w ..< a.len-1: - r[i-w] = (a[i] shr shift) or (a[i+1] shl (WordBitWidth - shift)) - r[^(1+w)] = a[^1] shr shift - else: - for i in countdown(a.len-1, 1+w): - r[i-w] = (a[i] shr shift) or (a[i-1] shl (WordBitWidth - k)) - r[0] = a[w] shr shift + for i in w ..< a.len-1: + r[i-w] = (a[i] shr shift) or (a[i+1] shl (WordBitWidth - shift)) + r[^(1+w)] = a[^1] shr shift func shrWords*(r: var Limbs, a: Limbs, w: SomeInteger) = ## Shift right by w word - when cpuEndian == littleEndian: - for i in 0 ..< Limbs.len-w: - r[i] = a[i+w] - for i in Limbs.len-w ..< Limbs.len: - r[i] = 0 - else: - for i in countdown(Limbs.len-1, Limbs.len-w): - r[i] = 0 - for i in countdown(Limbs.len-w, 0): - r[i] = a[i+w] + for i in 0 ..< Limbs.len-w: + r[i] = a[i+w] + for i in Limbs.len-w ..< Limbs.len: + r[i] = 0 func shlSmall*(r: var Limbs, a: Limbs, k: SomeInteger) = ## Compute the `shift left` operation of x and k ## ## k MUST be less than the base word size (2^32 or 2^64) - when cpuEndian == littleEndian: - r[0] = a[0] shl k - for i in 1 ..< a.len: - r[i] = (a[i] shl k) or (a[i-1] shr (WordBitWidth - k)) - else: - r[^1] = a[^1] shl k - for i in countdown(a.len-2, 0): - r[i] = (a[i] shl k) or (a[i+1] shr (WordBitWidth - k)) + r[0] = a[0] shl k + for i in 1 ..< a.len: + r[i] = (a[i] shl k) or (a[i-1] shr (WordBitWidth - k)) func shlLarge*(r: var Limbs, a: Limbs, w, shift: SomeInteger) = ## Shift left by `w` words + `shift` bits @@ -81,27 +60,16 @@ func shlLarge*(r: var Limbs, a: Limbs, w, shift: SomeInteger) = if w > Limbs.len: return - when cpuEndian == littleEndian: - r[w] = a[0] shl shift - for i in 1+w ..< r.len: - r[i] = (a[i-w] shl shift) or (a[i-w-1] shr (WordBitWidth - shift)) - else: - r[^1] = a[^w] shl shift - for i in countdown(a.len-2-w, 0): - r[i+w] = (a[i] shl shift) or (a[i+1] shr (WordBitWidth 
- shift)) + r[w] = a[0] shl shift + for i in 1+w ..< r.len: + r[i] = (a[i-w] shl shift) or (a[i-w-1] shr (WordBitWidth - shift)) func shlWords*(r: var Limbs, a: Limbs, w: SomeInteger) = ## Shift left by w word - when cpuEndian == littleEndian: - for i in 0 ..< w: - r[i] = 0 - for i in 0 ..< Limbs.len-w: - r[i+w] = a[i] - else: - for i in countdown(Limbs.len-1, Limbs.len-w): - r[i] = 0 - for i in countdown(Limbs.len-w-1, 0): - r[i] = a[i-w] + for i in 0 ..< w: + r[i] = 0 + for i in 0 ..< Limbs.len-w: + r[i+w] = a[i] # Wrappers # -------------------------------------------------------- @@ -133,7 +101,7 @@ func shiftLeft*(r: var Stuint, a: Stuint, k: SomeInteger) = if k < WordBitWidth: r.limbs.shlSmall(a.limbs, k) - r.clearExtraBits() + r.clearExtraBitsOverMSB() return # w = k div WordBitWidth, shift = k mod WordBitWidth @@ -145,4 +113,4 @@ func shiftLeft*(r: var Stuint, a: Stuint, k: SomeInteger) = else: r.limbs.shlLarge(a.limbs, w, shift) - r.clearExtraBits() + r.clearExtraBitsOverMSB() diff --git a/stint/uintops.nim b/stint/uintops.nim index f196bd2..681616f 100644 --- a/stint/uintops.nim +++ b/stint/uintops.nim @@ -30,14 +30,9 @@ func setZero*(a: var StUint) = func setSmallInt(a: var StUint, k: Word) = ## Set ``a`` to k - when cpuEndian == littleEndian: - a.limbs[0] = k - for i in 1 ..< a.limbs.len: - a.limbs[i] = 0 - else: - a.limbs[^1] = k - for i in 0 ..< a.limb.len - 1: - a.limbs[i] = 0 + a.limbs[0] = k + for i in 1 ..< a.limbs.len: + a.limbs[i] = 0 func setOne*(a: var StUint) = setSmallInt(a, 1) @@ -51,8 +46,9 @@ func one*[bits: static[int]](T: typedesc[Stuint[bits]]): T {.inline.} = result.setOne() func high*[bits](_: typedesc[Stuint[bits]]): Stuint[bits] {.inline.} = - for wr in leastToMostSig(result): - wr = high(Word) + for i in 0 ..< result.len: + result[i] = high(Word) + func low*[bits](_: typedesc[Stuint[bits]]): Stuint[bits] {.inline.} = discard @@ -62,15 +58,15 @@ func low*[bits](_: typedesc[Stuint[bits]]): Stuint[bits] {.inline.} = {.push raises: 
[], inline, noInit, gcsafe.} func isZero*(a: Stuint): bool = - for word in leastToMostSig(a): - if word != 0: + for i in 0 ..< a.limbs.len: + if a[i] != 0: return false return true func `==`*(a, b: Stuint): bool {.inline.} = ## Unsigned `equal` comparison - for wa, wb in leastToMostSig(a, b): - if wa != wb: + for i in 0 ..< a.limbs.len: + if a[i] != b[i]: return false return true @@ -78,8 +74,8 @@ func `<`*(a, b: Stuint): bool {.inline.} = ## Unsigned `less than` comparison var diff: Word var borrow: Borrow - for wa, wb in leastToMostSig(a, b): - subB(borrow, diff, wa, wb, borrow) + for i in 0 ..< a.limbs.len: + subB(borrow, diff, a[i], b[i], borrow) return bool(borrow) func `<=`*(a, b: Stuint): bool {.inline.} = @@ -89,12 +85,12 @@ func `<=`*(a, b: Stuint): bool {.inline.} = func isOdd*(a: Stuint): bool {.inline.} = ## Returns true if input is off ## false otherwise - bool(a.leastSignificantWord and 1) + bool(a[0] and 1) func isEven*(a: Stuint): bool {.inline.} = ## Returns true if input is zero ## false otherwise - not a.isOdd + not a.isOdd() {.pop.} # Bitwise operations @@ -178,7 +174,7 @@ export `+=` func `*`*(a, b: Stuint): Stuint = ## Integer multiplication result.limbs.prod(a.limbs, b.limbs) - result.clearExtraBits() + result.clearExtraBitsOverMSB() {.pop.} @@ -228,5 +224,20 @@ func pow*[aBits, eBits](a: Stuint[aBits], e: Stuint[eBits]): Stuint[aBits] = # Division & Modulo # -------------------------------------------------------- +{.push raises: [], inline, noInit, gcsafe.} + +func `div`*(x, y: Stuint): Stuint = + ## Division operation for multi-precision unsigned uint + var tmp{.noInit.}: Stuint + divRem(result.limbs, tmp.limbs, x.limbs, y.limbs) + +func `mod`*(x, y: Stuint): Stuint = + ## Remainder operation for multi-precision unsigned uint + var tmp{.noInit.}: Stuint + divRem(tmp.limbs, result.limbs, x.limbs, y.limbs) + +func divmod*(x, y: Stuint): tuple[quot, rem: Stuint] = + ## Division and remainder operations for multi-precision unsigned uint + 
divRem(result.quot.limbs, result.rem.limbs, x.limbs, y.limbs) -export uint_div \ No newline at end of file +{.pop.} \ No newline at end of file From 27e9c9e441b356ed05bd72872fdf62cf577ef028 Mon Sep 17 00:00:00 2001 From: Mamy Ratsimbazafy Date: Tue, 25 Jan 2022 23:12:52 +0100 Subject: [PATCH 20/26] Add randomized testing, harden against edge cases --- benchmarks/bench.nim | 91 ++++++ benchmarks/bench_mod.nim | 86 +++--- helpers/prng_unsafe.nim | 260 ++++++++++++++++++ stint.nimble | 38 +-- stint/private/datatypes.nim | 2 + stint/private/uint_div.nim | 11 + tests/all_tests.nim | 4 +- tests/property_based.nim | 238 ---------------- tests/t_randomized_divmod.nim | 46 ++++ ...ed_uint256.nim => test_uint256_ttmath.nim} | 4 - 10 files changed, 480 insertions(+), 300 deletions(-) create mode 100644 benchmarks/bench.nim create mode 100644 helpers/prng_unsafe.nim delete mode 100644 tests/property_based.nim create mode 100644 tests/t_randomized_divmod.nim rename tests/{property_based_uint256.nim => test_uint256_ttmath.nim} (99%) diff --git a/benchmarks/bench.nim b/benchmarks/bench.nim new file mode 100644 index 0000000..36d9a55 --- /dev/null +++ b/benchmarks/bench.nim @@ -0,0 +1,91 @@ +import ../stint, std/[times, monotimes] + +template bench(desc: string, body: untyped) = + let start = getMonotime() + body + let stop = getMonotime() + echo desc,": ", inMilliseconds(stop-start), " ms" + +# Warmup on normal int to ensure max CPU freq +# Complex enough that the compiler doesn't optimize it away + +proc warmup() = + var foo = 123 + bench "Warmup": + for i in 0 ..< 10_000_000: + foo += i*i mod 456 + foo = foo mod 789 + +warmup() +#################################### + +let a = [123'u64, 123'u64, 123'u64, 123'u64] +let m = [456'u64, 456'u64, 456'u64, 45'u64] + +proc add_stint(a, m: array[4, uint64]) = + let aU256 = cast[Stuint[256]](a) + let mU256 = cast[Stuint[256]](m) + + bench "Add (stint)": + var foo = aU256 + for i in 0 ..< 100_000_000: + foo += mU256 + foo += aU256 + +proc 
mul_stint(a, m: array[4, uint64]) = + let aU256 = cast[Stuint[256]](a) + let mU256 = cast[Stuint[256]](m) + + bench "Mul (stint)": + var foo = aU256 + for i in 0 ..< 100_000_000: + foo += (foo * foo) + +proc mod_stint(a, m: array[4, uint64]) = + let aU256 = cast[Stuint[256]](a) + let mU256 = cast[Stuint[256]](m) + + bench "Mod (stint)": + var foo = aU256 + for i in 0 ..< 100_000_000: + foo += (foo * foo) mod mU256 + +add_stint(a, m) +mul_stint(a, m) +mod_stint(a, m) + +when defined(bench_ttmath): + # need C++ + import ttmath, ../tests/ttmath_compat + + proc add_ttmath(a, m: array[4, uint64]) = + let aU256 = a.astt() + let mU256 = m.astt() + + bench "Add (ttmath)": + var foo = aU256 + for i in 0 ..< 100_000_000: + foo += mU256 + foo += aU256 + + proc mul_ttmath(a, m: array[4, uint64]) = + let aU256 = a.astt() + let mU256 = m.astt() + + bench "Mul (ttmath)": + var foo = aU256 + for i in 0 ..< 100_000_000: + foo += (foo * foo) + + proc mod_ttmath(a, m: array[4, uint64]) = + let aU256 = a.astt() + let mU256 = m.astt() + + bench "Mod (ttmath)": + var foo = aU256 + for i in 0 ..< 100_000_000: + foo += (foo * foo) mod mU256 + + add_ttmath(a, m) + mul_ttmath(a, m) + mod_ttmath(a, m) \ No newline at end of file diff --git a/benchmarks/bench_mod.nim b/benchmarks/bench_mod.nim index 57fabb7..2ec97e7 100644 --- a/benchmarks/bench_mod.nim +++ b/benchmarks/bench_mod.nim @@ -1,59 +1,69 @@ -import ../stint, times +import ../stint, std/[times, monotimes] +template bench(desc: string, body: untyped) = + let start = getMonotime() + body + let stop = getMonotime() + echo desc,": ", inMilliseconds(stop-start), " ms" -# Warmup on normal int -var start = cpuTime() -block: - var foo = 123 - for i in 0 ..< 10_000_000: - foo += i*i mod 456 - foo = foo mod 789 +# Warmup on normal int to ensure max CPU freq +# Complex enough that the compiler doesn't optimize it away -# Compiler shouldn't optimize away the results as cpuTime rely on sideeffects -var stop = cpuTime() -echo "Warmup: " & $(stop 
- start) & "s" +proc warmup() = + var foo = 123 + bench "Warmup": + for i in 0 ..< 10_000_000: + foo += i*i mod 456 + foo = foo mod 789 +warmup() #################################### let a = [123'u64, 123'u64, 123'u64, 123'u64] let m = [456'u64, 456'u64, 456'u64, 45'u64] -let aU256 = cast[Stuint[256]](a) -let mU256 = cast[Stuint[256]](m) +proc mul_stint(a, m: array[4, uint64]) = + let aU256 = cast[Stuint[256]](a) + let mU256 = cast[Stuint[256]](m) + + bench "Mul (stint)": + var foo = aU256 + for i in 0 ..< 100_000_000: + foo += (foo * foo) + +proc mod_stint(a, m: array[4, uint64]) = + let aU256 = cast[Stuint[256]](a) + let mU256 = cast[Stuint[256]](m) -start = cpuTime() -block: - var foo = aU256 - for i in 0 ..< 10_000_000: - foo += (foo * foo) mod mU256 + bench "Mod (stint)": + var foo = aU256 + for i in 0 ..< 100_000_000: + foo += (foo * foo) mod mU256 -stop = cpuTime() -echo "Library: " & $(stop - start) & "s" +mul_stint(a, m) +mod_stint(a, m) when defined(bench_ttmath): # need C++ import ttmath, ../tests/ttmath_compat - template tt_u256(a: int): UInt[256] = ttmath.u256(a.uint) - - start = cpuTime() - block: - var foo = a.astt() + proc mul_ttmath(a, m: array[4, uint64]) = + let aU256 = a.astt() let mU256 = m.astt() - for i in 0 ..< 10_000_000: - foo += (foo * foo) mod mU256 - stop = cpuTime() - echo "TTMath: " & $(stop - start) & "s" + bench "Mul (ttmath)": + var foo = aU256 + for i in 0 ..< 100_000_000: + foo += (foo * foo) -# On my i5-5257 broadwell with the flags: -# nim c -d:release -d:bench_ttmath -# Warmup: 0.04060799999999999s -# Library: 0.9576759999999999s -# TTMath: 0.758443s + proc mod_ttmath(a, m: array[4, uint64]) = + let aU256 = a.astt() + let mU256 = m.astt() + bench "Mod (ttmath)": + var foo = aU256 + for i in 0 ..< 100_000_000: + foo += (foo * foo) mod mU256 -# After PR #54 for compile-time evaluation -# which includes loop unrolling but may bloat the code -# Warmup: 0.03993500000000001s -# Library: 0.848464s + mul_ttmath(a, m) + mod_ttmath(a, 
m) \ No newline at end of file diff --git a/helpers/prng_unsafe.nim b/helpers/prng_unsafe.nim new file mode 100644 index 0000000..dba9758 --- /dev/null +++ b/helpers/prng_unsafe.nim @@ -0,0 +1,260 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + ../stint/io, + ../stint/private/datatypes + +# ############################################################ +# +# Pseudo-Random Number Generator +# for testing and benchmarking purposes +# +# ############################################################ +# +# The recommendation by Vigna at http://prng.di.unimi.it +# is to have a period of t^2 if we need t values (i.e. 
about 2^1024) +# but also that for all practical purposes 2^256 period is enough +# +# We use 2^512 since our main use-case is uint256 + +type RngState* = object + ## This is the state of a Xoshiro512** PRNG + ## Unsafe: for testing and benchmarking purposes only + s: array[8, uint64] + +func splitMix64(state: var uint64): uint64 = + state += 0x9e3779b97f4a7c15'u64 + result = state + result = (result xor (result shr 30)) * 0xbf58476d1ce4e5b9'u64 + result = (result xor (result shr 27)) * 0xbf58476d1ce4e5b9'u64 + result = result xor (result shr 31) + +func seed*(rng: var RngState, x: SomeInteger) = + ## Seed the random number generator with a fixed seed + var sm64 = uint64(x) + rng.s[0] = splitMix64(sm64) + rng.s[1] = splitMix64(sm64) + rng.s[2] = splitMix64(sm64) + rng.s[3] = splitMix64(sm64) + rng.s[4] = splitMix64(sm64) + rng.s[5] = splitMix64(sm64) + rng.s[6] = splitMix64(sm64) + rng.s[7] = splitMix64(sm64) + +func rotl(x: uint64, k: static int): uint64 {.inline.} = + return (x shl k) or (x shr (64 - k)) + +template `^=`(x: var uint64, y: uint64) = + x = x xor y + +func next(rng: var RngState): uint64 = + ## Compute a random uint64 from the input state + ## using xoshiro512** algorithm by Vigna et al + ## State is updated. 
+ result = rotl(rng.s[1] * 5, 7) * 9 + + let t = rng.s[1] shl 11 + rng.s[2] ^= rng.s[0]; + rng.s[5] ^= rng.s[1]; + rng.s[1] ^= rng.s[2]; + rng.s[7] ^= rng.s[3]; + rng.s[3] ^= rng.s[4]; + rng.s[4] ^= rng.s[5]; + rng.s[0] ^= rng.s[6]; + rng.s[6] ^= rng.s[7]; + + rng.s[6] ^= t; + + rng.s[7] = rotl(rng.s[7], 21); + +# Bithacks +# ------------------------------------------------------------ + +proc clearMask[T: SomeInteger](v: T, mask: T): T {.inline.} = + ## Returns ``v``, with all the ``1`` bits from ``mask`` set to 0 + v and not mask + +proc clearBit*[T: SomeInteger](v: T, bit: T): T {.inline.} = + ## Returns ``v``, with the bit at position ``bit`` set to 0 + v.clearMask(1.T shl bit) + +# Integer ranges +# ------------------------------------------------------------ + +func random_unsafe*(rng: var RngState, maxExclusive: uint32): uint32 = + ## Generate a random integer in 0 ..< maxExclusive + ## Uses an unbiaised generation method + ## See Lemire's algorithm modified by Melissa O'Neill + ## https://www.pcg-random.org/posts/bounded-rands.html + let max = maxExclusive + var x = uint32 rng.next() + var m = x.uint64 * max.uint64 + var l = uint32 m + if l < max: + var t = not(max) + 1 # -max + if t >= max: + t -= max + if t >= max: + t = t mod max + while l < t: + x = uint32 rng.next() + m = x.uint64 * max.uint64 + l = uint32 m + return uint32(m shr 32) + +func random_unsafe*[T: SomeInteger](rng: var RngState, inclRange: Slice[T]): T = + ## Return a random integer in the given range. + ## The range bounds must fit in an int32. 
+ let maxExclusive = inclRange.b + 1 - inclRange.a + result = T(rng.random_unsafe(uint32 maxExclusive)) + result += inclRange.a + +# Containers +# ------------------------------------------------------------ + +func sample_unsafe*[T](rng: var RngState, src: openarray[T]): T = + ## Return a random sample from an array + result = src[rng.random_unsafe(uint32 src.len)] + +# BigInts +# ------------------------------------------------------------ +# +# Statistics note: +# - A skewed distribution is not symmetric, it has a longer tail in one direction. +# for example a RNG that is not centered over 0.5 distribution of 0 and 1 but +# might produces more 1 than 0 or vice-versa. +# - A bias is a result that is consistently off from the true value i.e. +# a deviation of an estimate from the quantity under observation + +func random_unsafe(rng: var RngState, a: var SomeBigInteger) = + ## Initialize a standalone BigInt + for i in 0 ..< a.limbs.len: + a.limbs[i] = Word(rng.next()) + +func random_word_highHammingWeight(rng: var RngState): Word = + let numZeros = rng.random_unsafe(WordBitWidth div 3) # Average Hamming Weight is 1-0.33/2 = 0.83 + result = high(Word) + for _ in 0 ..< numZeros: + result = result.clearBit rng.random_unsafe(WordBitWidth) + +func random_highHammingWeight(rng: var RngState, a: var SomeBigInteger) = + ## Initialize a standalone BigInt + ## with high Hamming weight + ## to have a higher probability of triggering carries + for i in 0 ..< a.limbs.len: + a.limbs[i] = Word rng.random_word_highHammingWeight() + +func random_long01Seq(rng: var RngState, a: var openArray[byte]) = + ## Initialize a bytearray + ## It is skewed towards producing strings of 1111... 
and 0000 + ## to trigger edge cases + # See libsecp256k1: https://github.com/bitcoin-core/secp256k1/blob/dbd41db1/src/testrand_impl.h#L90-L104 + let Bits = a.len * 8 + var bit = 0 + zeroMem(a[0].addr, a.len) + while bit < Bits : + var now = 1 + (rng.random_unsafe(1 shl 6) * rng.random_unsafe(1 shl 5) + 16) div 31 + let val = rng.sample_unsafe([0, 1]) + while now > 0 and bit < Bits: + a[bit shr 3] = a[bit shr 3] or byte(val shl (bit and 7)) + dec now + inc bit + +func random_long01Seq(rng: var RngState, a: var SomeBigInteger) = + ## Initialize a bigint + ## It is skewed towards producing strings of 1111... and 0000 + ## to trigger edge cases + var buf: array[(a.bits + 7) div 8, byte] + rng.random_long01Seq(buf) + a = (typeof a).fromBytesBE(buf) + +# Byte sequences +# ------------------------------------------------------------ + +func random_byte_seq*(rng: var RngState, length: int): seq[byte] = + result.newSeq(length) + for b in result.mitems: + b = byte rng.next() + +# Generic over any Stint type +# ------------------------------------------------------------ + +func random_unsafe*(rng: var RngState, T: typedesc): T = + ## Create a random Field or Extension Field or Curve Element + ## Unsafe: for testing and benchmarking purposes only + rng.random_unsafe(result) + +func random_highHammingWeight*(rng: var RngState, T: typedesc): T = + ## Create a random Field or Extension Field or Curve Element + ## Skewed towards high Hamming Weight + rng.random_highHammingWeight(result) + +func random_long01Seq*(rng: var RngState, T: typedesc): T = + ## Create a random Field or Extension Field or Curve Element + ## Skewed towards long bitstrings of 0 or 1 + rng.random_long01Seq(result) + +type + RandomGen* = enum + Uniform + HighHammingWeight + Long01Sequence + +func random_elem*(rng: var RngState, T: typedesc, gen: RandomGen): T {.inline, noInit.} = + case gen + of Uniform: + result = rng.random_unsafe(T) + of HighHammingWeight: + result = rng.random_highHammingWeight(T) + of 
Long01Sequence: + result = rng.random_long01Seq(T) + +# Sanity checks +# ------------------------------------------------------------ + +when isMainModule: + import std/[tables, times, strutils] + + var rng: RngState + let timeSeed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32 + rng.seed(timeSeed) + echo "prng_sanity_checks xoshiro512** seed: ", timeSeed + + + proc test[T](s: Slice[T]) = + var c = initCountTable[int]() + + for _ in 0 ..< 1_000_000: + c.inc(rng.random_unsafe(s)) + + echo "1'000'000 pseudo-random outputs from ", s.a, " to ", s.b, " (incl): ", c + + test(0..1) + test(0..2) + test(1..52) + test(-10..10) + + echo "\n-----------------------------\n" + echo "High Hamming Weight check" + for _ in 0 ..< 10: + let word = rng.random_word_highHammingWeight() + echo "0b", cast[BiggestInt](word).toBin(WordBitWidth), " - 0x", word.toHex() + + echo "\n-----------------------------\n" + echo "Long strings of 0 or 1 check" + for _ in 0 ..< 10: + var a: BigInt[127] + rng.random_long01seq(a) + stdout.write "0b" + for word in a.limbs: + stdout.write cast[BiggestInt](word).toBin(WordBitWidth) + stdout.write " - 0x" + for word in a.limbs: + stdout.write word.BaseType.toHex() + stdout.write '\n' diff --git a/stint.nimble b/stint.nimble index 3eeacfd..6f67392 100644 --- a/stint.nimble +++ b/stint.nimble @@ -14,6 +14,7 @@ requires "nim >= 1.6.0", proc test(args, path: string) = if not dirExists "build": mkDir "build" + exec "nim " & getEnv("TEST_LANG", "c") & " " & getEnv("NIMFLAGS") & " " & args & " --outdir:build -r --hints:off --warnings:off --skipParentCfg" & " --styleCheck:usages --styleCheck:error " & path @@ -22,23 +23,22 @@ proc test(args, path: string) = " --outdir:build -r --mm:refc --hints:off --warnings:off --skipParentCfg" & " --styleCheck:usages --styleCheck:error " & path -task test, "Run all tests - test and production implementation": - # Run tests for internal procs - test implementation (StUint[64] = 2x uint32 - test 
"-d:stint_test", "tests/internal.nim" - # Run tests for internal procs - prod implementation (StUint[64] = uint64 - test "", "tests/internal.nim" - # Run all tests - test implementation (StUint[64] = 2x uint32 - test "-d:stint_test", "tests/all_tests.nim" - # Run all tests - prod implementation (StUint[64] = uint64 - test "--threads:on", "tests/all_tests.nim" +task test_internal, "Run tests for internal procs": + test "internal" + +task test_public_api, "Run all tests - prod implementation (StUint[64] = uint64": + test "all_tests" - ## quicktest-0.20.0/quicktest.nim(277, 30) Error: cannot evaluate at compile time: BUILTIN_NAMES - ## - # # Run random tests (debug mode) - test implementation (StUint[64] = 2x uint32) - # test "-d:stint_test", "tests/property_based.nim" - # # Run random tests (release mode) - test implementation (StUint[64] = 2x uint32) - # test "-d:stint_test -d:release", "tests/property_based.nim" - # # Run random tests Uint256 (debug mode) vs TTMath (StUint[256] = 8 x uint32) - # test "", "tests/property_based.nim" - # # Run random tests Uint256 (release mode) vs TTMath (StUint[256] = 4 x uint64) - # test "-d:release", "tests/property_based.nim" +task test_uint256_ttmath, "Run random tests Uint256 vs TTMath": + requires "https://github.com/alehander42/nim-quicktest >= 0.18.0", "https://github.com/status-im/nim-ttmath" + switch("define", "release") + test "uint256_ttmath", "cpp" + +task test, "Run all tests - test and production implementation": + exec "nimble test_internal" + exec "nimble test_public_api" + ## TODO test only requirements don't work: https://github.com/nim-lang/nimble/issues/482 + # exec "nimble test_property_debug" + # exec "nimble test_property_release" + # exec "nimble test_property_uint256_debug" + # exec "nimble test_property_uint256_release" diff --git a/stint/private/datatypes.nim b/stint/private/datatypes.nim index 1c43049..1792304 100644 --- a/stint/private/datatypes.nim +++ b/stint/private/datatypes.nim @@ -66,6 +66,7 @@ 
template clearExtraBitsOverMSB*(a: var StUint) = func usedBitsAndWords*(a: openArray[Word]): tuple[bits, words: int] = ## Returns the number of used words and bits in a bigInt + ## Returns (0, 0) for all-zeros array (even if technically you need 1 bit and 1 word to encode zero) var clz = 0 # Count Leading Zeros for i in countdown(a.len-1, 0): @@ -76,6 +77,7 @@ func usedBitsAndWords*(a: openArray[Word]): tuple[bits, words: int] = else: clz += WordBitWidth - count - 1 return (a.len*WordBitWidth - clz, i+1) + return (0, 0) {.pop.} diff --git a/stint/private/uint_div.nim b/stint/private/uint_div.nim index bb7ca5d..960d02e 100644 --- a/stint/private/uint_div.nim +++ b/stint/private/uint_div.nim @@ -29,6 +29,13 @@ func shortDiv*(a: var Limbs, k: Word): Word = # dividend = 2^64 * remainder + a[i] var hi = result var lo = a[i] + if hi == 0: + if lo < k: + a[i] = 0 + elif lo == k: + a[i] = 1 + result = 0 + continue # Normalize, shifting the remainder by clz(k) cannot overflow. hi = (hi shl clz) or (lo shr (WordBitWidth - clz)) lo = lo shl clz @@ -216,6 +223,7 @@ func divmod(q, r: var Stuint, q = high(Word) # quotient = MaxWord (0b1111...1111) elif a0 == 0 and a1 < m0: # elif q == 0, true quotient = 0 q = 0 + return q else: var r: Word div2n1n(q, r, a0, a1, m0) # else instead of being of by 0, 1 or 2 @@ -295,6 +303,9 @@ func divRem*( let (bBits, bLen) = usedBitsAndWords(b) let rLen = bLen + if unlikely(bBits == 0): + raise newException(DivByZeroError, "You attempted to divide by zero") + if aBits < bBits: # if a uses less bits than b, # a < b, so q = 0 and r = a diff --git a/tests/all_tests.nim b/tests/all_tests.nim index 45b138d..a1fba07 100644 --- a/tests/all_tests.nim +++ b/tests/all_tests.nim @@ -15,7 +15,8 @@ import test_uint_bitops2, test_uint_muldiv, test_uint_exp, test_uint_modular_arithmetic, - test_uint_endians2 + test_uint_endians2, + test_randomized_divmod import test_int_endianness, test_int_comparison, @@ -26,3 +27,4 @@ import test_int_endianness, import 
test_io, test_conversion + diff --git a/tests/property_based.nim b/tests/property_based.nim deleted file mode 100644 index 640c3f3..0000000 --- a/tests/property_based.nim +++ /dev/null @@ -1,238 +0,0 @@ -# Stint -# Copyright 2018 Status Research & Development GmbH -# Licensed under either of -# -# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) -# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) -# -# at your option. This file may not be copied, modified, or distributed except according to those terms. - -import ../stint, unittest, quicktest, math - -const itercount = 1000 - -suite "Property-based testing (testing with random inputs) - uint64 on 64-bit / uint32 on 32-bit": - - when defined(release): - echo "Testing in release mode with " & $itercount & " random tests for each proc." - else: - echo "Testing in debug mode " & $itercount & " random tests for each proc. (StUint[64] = 2x uint32)" - when defined(stint_test): - echo "(StUint[64] = 2x uint32)" - else: - echo "(StUint[64] = uint64)" - - let hi = 1'u shl (sizeof(uint)*7) - - quicktest "`or`", itercount do(x: uint(min=0, max=hi), y: uint(min=0, max=hi)): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - ty = cast[StUint[64]](y) - tz = tx or ty - else: - let - tx = cast[StUint[32]](x) - ty = cast[StUint[32]](y) - tz = tx or ty - - - check(cast[uint](tz) == (x or y)) - - - quicktest "`and`", itercount do(x: uint(min=0, max=hi), y: uint(min=0, max=hi)): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - ty = cast[StUint[64]](y) - tz = tx and ty - else: - let - tx = cast[StUint[32]](x) - ty = cast[StUint[32]](y) - tz = tx and ty - - check(cast[uint](tz) == (x and y)) - - quicktest "`xor`", itercount do(x: uint(min=0, max=hi), y: uint(min=0, max=hi)): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - ty = cast[StUint[64]](y) - tz = tx xor ty - else: - let - tx = cast[StUint[32]](x) - 
ty = cast[StUint[32]](y) - tz = tx xor ty - - check(cast[uint](tz) == (x xor y)) - - quicktest "`not`", itercount do(x: uint(min=0, max=hi)): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - tz = not tx - else: - let - tx = cast[StUint[32]](x) - tz = not tx - - check(cast[uint](tz) == (not x)) - - quicktest "`<`", itercount do(x: uint(min=0, max=hi), y: uint(min=0, max=hi)): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - ty = cast[StUint[64]](y) - tz = tx < ty - else: - let - tx = cast[StUint[32]](x) - ty = cast[StUint[32]](y) - tz = tx < ty - - check(tz == (x < y)) - - - quicktest "`<=`", itercount do(x: uint(min=0, max=hi), y: uint(min=0, max=hi)): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - ty = cast[StUint[64]](y) - tz = tx <= ty - else: - let - tx = cast[StUint[32]](x) - ty = cast[StUint[32]](y) - tz = tx <= ty - - check(tz == (x <= y)) - - quicktest "`+`", itercount do(x: uint(min=0, max=hi), y: uint(min=0, max=hi)): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - ty = cast[StUint[64]](y) - tz = tx + ty - else: - let - tx = cast[StUint[32]](x) - ty = cast[StUint[32]](y) - tz = tx + ty - - check(cast[uint](tz) == x+y) - - - quicktest "`-`", itercount do(x: uint(min=0, max=hi), y: uint(min=0, max=hi)): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - ty = cast[StUint[64]](y) - tz = tx - ty - else: - let - tx = cast[StUint[32]](x) - ty = cast[StUint[32]](y) - tz = tx - ty - - check(cast[uint](tz) == x-y) - - quicktest "`*`", itercount do(x: uint(min=0, max=hi), y: uint(min=0, max=hi)): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - ty = cast[StUint[64]](y) - tz = tx * ty - else: - let - tx = cast[StUint[32]](x) - ty = cast[StUint[32]](y) - tz = tx * ty - - check(cast[uint](tz) == x*y) - - quicktest "`shl`", itercount do(x: uint(min=0, max=hi), y: int(min = 0, max=(sizeof(int)*8-1))): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - tz = tx shl y - else: - let - 
tx = cast[StUint[32]](x) - tz = tx shl y - - check(cast[uint](tz) == x shl y) - - quicktest "`shr`", itercount do(x: uint(min=0, max=hi), y: int(min = 0, max=(sizeof(int)*8-1))): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - tz = tx shr y - else: - let - tx = cast[StUint[32]](x) - tz = tx shr y - - check(cast[uint](tz) == x shr y) - - quicktest "`mod`", itercount do(x: uint(min=0, max=hi), y: uint(min = 1, max = hi)): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - ty = cast[StUint[64]](y) - tz = tx mod ty - else: - let - tx = cast[StUint[32]](x) - ty = cast[StUint[32]](y) - tz = tx mod ty - - check(cast[uint](tz) == x mod y) - - quicktest "`div`", itercount do(x: uint(min=0, max=hi), y: uint(min = 1, max = hi)): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - ty = cast[StUint[64]](y) - tz = tx div ty - else: - let - tx = cast[StUint[32]](x) - ty = cast[StUint[32]](y) - tz = tx div ty - - check(cast[uint](tz) == x div y) - - quicktest "pow", itercount do(x: uint(min=0, max=hi), y: int(min = 0, max = high(int))): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - tz = tx.pow(y) - - ty = cast[StUint[64]](y) - tz2 = tx.pow(ty) - else: - let - tx = cast[StUint[32]](x) - tz = tx.pow(y) - - ty = cast[StUint[32]](y) - tz2 = tx.pow(ty) - - check(cast[uint](tz) == x ^ y) - check(cast[uint](tz2) == x ^ y) diff --git a/tests/t_randomized_divmod.nim b/tests/t_randomized_divmod.nim new file mode 100644 index 0000000..c0e2e71 --- /dev/null +++ b/tests/t_randomized_divmod.nim @@ -0,0 +1,46 @@ +# Stint +# Copyright 2022 Status Research & Development GmbH +# Licensed under either of +# +# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +# +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+ +import + # Standard library + std/[unittest, times], + # Internal + ../stint, + # Test utilities + ../helpers/prng_unsafe + +const Iters = 50000 + +var rng: RngState +let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32 +rng.seed(seed) +echo "\n------------------------------------------------------\n" +echo "t_randomized_divmod xoshiro512** seed: ", seed + +proc test_divmod(bits: static int, iters: int, gen: RandomGen) = + for _ in 0 ..< iters: + let a = rng.random_elem(Stuint[bits], gen) + let b = rng.random_elem(Stuint[bits], gen) + + try: + let (q, r) = divmod(a, b) + doAssert a == q*b + r + except DivByZeroDefect: + doAssert b.isZero() + +template test(bits: static int) = + test "(q, r) = divmod(a, b) <=> a = q*b + r (" & $bits & " bits)": + test_divmod(bits, Iters, Uniform) + test_divmod(bits, Iters, HighHammingWeight) + test_divmod(bits, Iters, Long01Sequence) + +suite "Randomized division and modulo checks": + test(128) + test(256) + test(512) \ No newline at end of file diff --git a/tests/property_based_uint256.nim b/tests/test_uint256_ttmath.nim similarity index 99% rename from tests/property_based_uint256.nim rename to tests/test_uint256_ttmath.nim index d722a06..8203737 100644 --- a/tests/property_based_uint256.nim +++ b/tests/test_uint256_ttmath.nim @@ -20,10 +20,6 @@ suite "Property-based testing (testing with random inputs) of Uint256": echo "Testing in release mode with " & $itercount & " random tests for each proc." else: echo "Testing in debug mode " & $itercount & " random tests for each proc. 
(StUint[64] = 2x uint32)" - when defined(stint_test): - echo "(StUint[64] = 2x uint32)" - else: - echo "(StUint[64] = uint64)" let hi = 1'u shl (sizeof(uint64)*7) From bff69b3f98589a851194045c126d605ef6d00c67 Mon Sep 17 00:00:00 2001 From: Mamy Ratsimbazafy Date: Mon, 31 Jan 2022 23:30:45 +0100 Subject: [PATCH 21/26] Add randomized testing vs GMP --- benchmarks/bench_mod.nim | 69 ------------ helpers/prng_unsafe.nim | 2 +- helpers/staticfor.nim | 29 +++++ stint.nimble | 10 +- stint/endians2.nim | 8 +- tests/t_randomized_vs_gmp.nim | 197 ++++++++++++++++++++++++++++++++++ 6 files changed, 234 insertions(+), 81 deletions(-) delete mode 100644 benchmarks/bench_mod.nim create mode 100644 helpers/staticfor.nim create mode 100644 tests/t_randomized_vs_gmp.nim diff --git a/benchmarks/bench_mod.nim b/benchmarks/bench_mod.nim deleted file mode 100644 index 2ec97e7..0000000 --- a/benchmarks/bench_mod.nim +++ /dev/null @@ -1,69 +0,0 @@ -import ../stint, std/[times, monotimes] - -template bench(desc: string, body: untyped) = - let start = getMonotime() - body - let stop = getMonotime() - echo desc,": ", inMilliseconds(stop-start), " ms" - -# Warmup on normal int to ensure max CPU freq -# Complex enough that the compiler doesn't optimize it away - -proc warmup() = - var foo = 123 - bench "Warmup": - for i in 0 ..< 10_000_000: - foo += i*i mod 456 - foo = foo mod 789 - -warmup() -#################################### - -let a = [123'u64, 123'u64, 123'u64, 123'u64] -let m = [456'u64, 456'u64, 456'u64, 45'u64] - -proc mul_stint(a, m: array[4, uint64]) = - let aU256 = cast[Stuint[256]](a) - let mU256 = cast[Stuint[256]](m) - - bench "Mul (stint)": - var foo = aU256 - for i in 0 ..< 100_000_000: - foo += (foo * foo) - -proc mod_stint(a, m: array[4, uint64]) = - let aU256 = cast[Stuint[256]](a) - let mU256 = cast[Stuint[256]](m) - - bench "Mod (stint)": - var foo = aU256 - for i in 0 ..< 100_000_000: - foo += (foo * foo) mod mU256 - -mul_stint(a, m) -mod_stint(a, m) - -when 
defined(bench_ttmath): - # need C++ - import ttmath, ../tests/ttmath_compat - - proc mul_ttmath(a, m: array[4, uint64]) = - let aU256 = a.astt() - let mU256 = m.astt() - - bench "Mul (ttmath)": - var foo = aU256 - for i in 0 ..< 100_000_000: - foo += (foo * foo) - - proc mod_ttmath(a, m: array[4, uint64]) = - let aU256 = a.astt() - let mU256 = m.astt() - - bench "Mod (ttmath)": - var foo = aU256 - for i in 0 ..< 100_000_000: - foo += (foo * foo) mod mU256 - - mul_ttmath(a, m) - mod_ttmath(a, m) \ No newline at end of file diff --git a/helpers/prng_unsafe.nim b/helpers/prng_unsafe.nim index dba9758..d977e6d 100644 --- a/helpers/prng_unsafe.nim +++ b/helpers/prng_unsafe.nim @@ -172,7 +172,7 @@ func random_long01Seq(rng: var RngState, a: var SomeBigInteger) = ## to trigger edge cases var buf: array[(a.bits + 7) div 8, byte] rng.random_long01Seq(buf) - a = (typeof a).fromBytesBE(buf) + a = (typeof a).fromBytes(buf, bigEndian) # Byte sequences # ------------------------------------------------------------ diff --git a/helpers/staticfor.nim b/helpers/staticfor.nim new file mode 100644 index 0000000..95a3864 --- /dev/null +++ b/helpers/staticfor.nim @@ -0,0 +1,29 @@ +import std/macros + +proc replaceNodes(ast: NimNode, what: NimNode, by: NimNode): NimNode = + # Replace "what" ident node by "by" + proc inspect(node: NimNode): NimNode = + case node.kind: + of {nnkIdent, nnkSym}: + if node.eqIdent(what): + return by + return node + of nnkEmpty: + return node + of nnkLiterals: + return node + else: + var rTree = node.kind.newTree() + for child in node: + rTree.add inspect(child) + return rTree + result = inspect(ast) + +macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped): untyped = + ## staticFor [min inclusive, max exclusive) + result = newStmtList() + for i in start ..< stopEx: + result.add nnkBlockStmt.newTree( + ident("unrolledIter_" & $idx & $i), + body.replaceNodes(idx, newLit i) + ) \ No newline at end of file diff --git a/stint.nimble 
b/stint.nimble index 6f67392..7195f13 100644 --- a/stint.nimble +++ b/stint.nimble @@ -1,5 +1,5 @@ packageName = "stint" -version = "0.0.1" +version = "2.0.0" author = "Status Research & Development GmbH" description = "Efficient stack-based multiprecision int in Nim" license = "Apache License 2.0 or MIT" @@ -9,7 +9,6 @@ skipDirs = @["tests", "benchmarks"] # TODO test only requirements don't work: https://github.com/nim-lang/nimble/issues/482 requires "nim >= 1.6.0", "stew" - #, "https://github.com/alehander42/nim-quicktest >= 0.18.0", "https://github.com/status-im/nim-ttmath" proc test(args, path: string) = if not dirExists "build": @@ -34,11 +33,6 @@ task test_uint256_ttmath, "Run random tests Uint256 vs TTMath": switch("define", "release") test "uint256_ttmath", "cpp" -task test, "Run all tests - test and production implementation": +task test, "Run all tests": exec "nimble test_internal" exec "nimble test_public_api" - ## TODO test only requirements don't work: https://github.com/nim-lang/nimble/issues/482 - # exec "nimble test_property_debug" - # exec "nimble test_property_release" - # exec "nimble test_property_uint256_debug" - # exec "nimble test_property_uint256_release" diff --git a/stint/endians2.nim b/stint/endians2.nim index ac8796c..0730bea 100644 --- a/stint/endians2.nim +++ b/stint/endians2.nim @@ -126,7 +126,8 @@ func toBytesBE*[bits: static int](src: StUint[bits]): array[bits div 8, byte] {. 
result[tail-1-i] = toByte(lo shr ((tail-i)*8)) return -func toBytes*[bits: static int](x: StUint[bits], endian: Endianness = system.cpuEndian): array[bits div 8, byte] {.inline.} = +func toBytes*[bits: static int](x: StUint[bits], endian: Endianness = bigEndian): array[bits div 8, byte] {.inline.} = + ## Default to bigEndian if endian == littleEndian: result = x.toBytesLE() else: @@ -238,10 +239,11 @@ func fromBytesLE*[bits: static int]( func fromBytes*[bits: static int]( T: typedesc[StUint[bits]], x: openarray[byte], - srcEndian: Endianness = system.cpuEndian): T {.inline.} = + srcEndian: Endianness = bigEndian): T {.inline.} = ## Read an source bytearray with the specified endianness and ## convert it to an integer - when srcEndian == littleEndian: + ## Default to bigEndian + if srcEndian == littleEndian: result = fromBytesLE(T, x) else: result = fromBytesBE(T, x) diff --git a/tests/t_randomized_vs_gmp.nim b/tests/t_randomized_vs_gmp.nim new file mode 100644 index 0000000..7638031 --- /dev/null +++ b/tests/t_randomized_vs_gmp.nim @@ -0,0 +1,197 @@ +# Stint +# Copyright (c) 2018-2022 Status Research & Development GmbH +# +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
+ +import + # Standard library + std/[unittest, times, strutils], + # Third-party + gmp, stew/byteutils, + # Internal + ../stint, + # Test utilities + ../helpers/[prng_unsafe, staticfor] + +const + Iters = 1000 + Bitwidths = [128, 256, 512, 1024, 2048] + +const # https://gmplib.org/manual/Integer-Import-and-Export.html + GMP_WordLittleEndian {.used.} = -1'i32 + GMP_WordNativeEndian {.used.} = 0'i32 + GMP_WordBigEndian {.used.} = 1'i32 + + GMP_MostSignificantWordFirst = 1'i32 + GMP_LeastSignificantWordFirst {.used.} = -1'i32 + +var rng: RngState +let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32 +rng.seed(seed) +echo "\n------------------------------------------------------\n" +echo "t_randomized_vs_gmp xoshiro512** seed: ", seed + +proc rawUint(dst: var openArray[byte], src: mpz_t): csize = + ## Converts a GMP bigint to a canonical integer as a BigEndian array of byte + ## Returns the number of words actually written + discard mpz_export(dst[0].addr, result.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, src) + +proc fromStuint[bits: static int](dst: var mpz_t, src: Stuint[bits]) = + let t = src.toBytes() + mpz_import(dst, t.len, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, t[0].addr) + + # Sanity check + var t2: typeof(t) + let wordsWritten = t2.rawUint(dst) + # Note: in bigEndian, GMP aligns left while Stint aligns right + doAssert t2.toOpenArray(0, wordsWritten-1) == t.toOpenArray(t.len-wordsWritten, t.len-1) + +proc test_add(bits: static int, iters: int, gen: RandomGen) = + + const N = (bits + 7) div 8 + + var x, y, z, m: mpz_t + mpz_init(x) + mpz_init(y) + mpz_init(z) + mpz_init(m) + mpz_ui_pow_ui(m, 2, bits) # 2^bits + + for _ in 0 ..< iters: + let a = rng.random_elem(Stuint[bits], gen) + let b = rng.random_elem(Stuint[bits], gen) + + x.fromStuint(a) + y.fromStuint(b) + + let c = a + b + mpz_add(z, x, y) + mpz_mod(z, z, m) + + let cBytes = c.toBytes() + + var zBytes: array[N, byte] + let wordsWritten 
= zBytes.rawUint(z) + + # Note: in bigEndian, GMP aligns left while Stint aligns right + doAssert zBytes.toOpenArray(0, wordsWritten-1) == cBytes.toOpenArray(N-wordsWritten, N-1), block: + # Reexport as bigEndian for debugging + var xBuf, yBuf: array[N, byte] + discard xBuf.rawUint(x) + discard yBuf.rawUint(y) + "\nAddition with operands\n" & + " x (" & align($bits, 4) & "-bit): 0x" & xBuf.toHex & "\n" & + " y (" & align($bits, 4) & "-bit): 0x" & yBuf.toHex & "\n" & + "failed:" & "\n" & + " GMP: 0x" & zBytes.toHex() & "\n" & + " Stint: 0x" & cBytes.toHex() & "\n" & + "(Note that GMP aligns bytes left while Stint aligns bytes right)" + +template testAddition(bits: static int) = + test "Addition vs GMP (" & $bits & " bits)": + test_add(bits, Iters, Uniform) + test_add(bits, Iters, HighHammingWeight) + test_add(bits, Iters, Long01Sequence) + +proc test_sub(bits: static int, iters: int, gen: RandomGen) = + + const N = (bits + 7) div 8 + + var x, y, z, m: mpz_t + mpz_init(x) + mpz_init(y) + mpz_init(z) + mpz_init(m) + mpz_ui_pow_ui(m, 2, bits) # 2^bits + + for _ in 0 ..< iters: + let a = rng.random_elem(Stuint[bits], gen) + let b = rng.random_elem(Stuint[bits], gen) + + x.fromStuint(a) + y.fromStuint(b) + + let c = a - b + mpz_sub(z, x, y) + mpz_mod(z, z, m) + + let cBytes = c.toBytes() + + var zBytes: array[N, byte] + let wordsWritten = zBytes.rawUint(z) + + # Note: in bigEndian, GMP aligns left while Stint aligns right + doAssert zBytes.toOpenArray(0, wordsWritten-1) == cBytes.toOpenArray(N-wordsWritten, N-1), block: + # Reexport as bigEndian for debugging + var xBuf, yBuf: array[N, byte] + discard xBuf.rawUint(x) + discard yBuf.rawUint(y) + "\nSubstraction with operands\n" & + " x (" & align($bits, 4) & "-bit): 0x" & xBuf.toHex & "\n" & + " y (" & align($bits, 4) & "-bit): 0x" & yBuf.toHex & "\n" & + "failed:" & "\n" & + " GMP: 0x" & zBytes.toHex() & "\n" & + " Stint: 0x" & cBytes.toHex() & "\n" & + "(Note that GMP aligns bytes left while Stint aligns bytes right)" + 
+template testSubstraction(bits: static int) = + test "Substaction vs GMP (" & $bits & " bits)": + test_sub(bits, Iters, Uniform) + test_sub(bits, Iters, HighHammingWeight) + test_sub(bits, Iters, Long01Sequence) + +proc test_mul(bits: static int, iters: int, gen: RandomGen) = + + const N = (bits + 7) div 8 + + var x, y, z, m: mpz_t + mpz_init(x) + mpz_init(y) + mpz_init(z) + mpz_init(m) + mpz_ui_pow_ui(m, 2, bits) # 2^bits + + for _ in 0 ..< iters: + let a = rng.random_elem(Stuint[bits], gen) + let b = rng.random_elem(Stuint[bits], gen) + + x.fromStuint(a) + y.fromStuint(b) + + let c = a * b + mpz_mul(z, x, y) + mpz_mod(z, z, m) + + let cBytes = c.toBytes() + + var zBytes: array[N, byte] + let wordsWritten = zBytes.rawUint(z) + + # Note: in bigEndian, GMP aligns left while Stint aligns right + doAssert zBytes.toOpenArray(0, wordsWritten-1) == cBytes.toOpenArray(N-wordsWritten, N-1), block: + # Reexport as bigEndian for debugging + var xBuf, yBuf: array[N, byte] + discard xBuf.rawUint(x) + discard yBuf.rawUint(y) + "\nMultiplication with operands\n" & + " x (" & align($bits, 4) & "-bit): 0x" & xBuf.toHex & "\n" & + " y (" & align($bits, 4) & "-bit): 0x" & yBuf.toHex & "\n" & + "failed:" & "\n" & + " GMP: 0x" & zBytes.toHex() & "\n" & + " Stint: 0x" & cBytes.toHex() & "\n" & + "(Note that GMP aligns bytes left while Stint aligns bytes right)" + +template testMultiplication(bits: static int) = + test "Multiplication vs GMP (" & $bits & " bits)": + test_mul(bits, Iters, Uniform) + test_mul(bits, Iters, HighHammingWeight) + test_mul(bits, Iters, Long01Sequence) + +suite "Randomized arithmetic tests vs GMP": + staticFor i, 0, Bitwidths.len: + testAddition(Bitwidths[i]) + testSubstraction(Bitwidths[i]) + testMultiplication(Bitwidths[i]) \ No newline at end of file From 63a32129c857a553b9b0be7d1cbfa9919097240c Mon Sep 17 00:00:00 2001 From: jangko Date: Fri, 9 Jun 2023 15:46:21 +0700 Subject: [PATCH 22/26] rebase and try to make it works with clients --- stint.nim | 14 
++-- stint.nimble | 4 +- stint/endians2.nim | 4 +- stint/intops.nim | 16 ++-- stint/io.nim | 132 ++++++++++++++++-------------- stint/literals_stint.nim | 2 +- stint/modular_arithmetic.nim | 2 +- stint/private/datatypes.nim | 25 ++++-- stint/private/uint_bitwise.nim | 4 +- stint/private/uint_div.nim | 142 +-------------------------------- stint/uintops.nim | 2 +- 11 files changed, 116 insertions(+), 231 deletions(-) diff --git a/stint.nim b/stint.nim index b6c02e7..efd52f3 100644 --- a/stint.nim +++ b/stint.nim @@ -1,5 +1,5 @@ # Stint -# Copyright 2018 Status Research & Development GmbH +# Copyright 2018-2023 Status Research & Development GmbH # Licensed under either of # # * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) @@ -10,12 +10,12 @@ # import stint/[bitops2, endians2, intops, io, modular_arithmetic, literals_stint] # export bitops2, endians2, intops, io, modular_arithmetic, literals_stint -import stint/[io, uintops] -export io, uintops +import stint/[io, uintops, intops, literals_stint, modular_arithmetic] +export io, uintops, intops, literals_stint, modular_arithmetic type - # Int128* = Stint[128] - # Int256* = Stint[256] + Int128* = Stint[128] + Int256* = Stint[256] UInt128* = StUint[128] UInt256* = StUint[256] @@ -25,8 +25,8 @@ func u128*(s: string): UInt128 {.inline.} = s.parse(UInt128) func u256*(n: SomeInteger): UInt256 {.inline.} = n.stuint(256) func u256*(s: string): UInt256 {.inline.} = s.parse(UInt256) -# func i128*(n: SomeInteger): Int128 {.inline.} = n.stint(128) +func i128*(n: SomeInteger): Int128 {.inline.} = n.stint(128) # func i128*(s: string): Int128 {.inline.} = s.parse(Int128) -# func i256*(n: SomeInteger): Int256 {.inline.} = n.stint(256) +func i256*(n: SomeInteger): Int256 {.inline.} = n.stint(256) # func i256*(s: string): Int256 {.inline.} = s.parse(Int256) diff --git a/stint.nimble b/stint.nimble index 7195f13..e19546c 100644 --- a/stint.nimble +++ b/stint.nimble @@ -7,10 
+7,10 @@ skipDirs = @["tests", "benchmarks"] ### Dependencies # TODO test only requirements don't work: https://github.com/nim-lang/nimble/issues/482 -requires "nim >= 1.6.0", +requires "nim >= 1.6.12", "stew" -proc test(args, path: string) = +proc test(name: string, lang: string = "c") = if not dirExists "build": mkDir "build" diff --git a/stint/endians2.nim b/stint/endians2.nim index 0730bea..3614940 100644 --- a/stint/endians2.nim +++ b/stint/endians2.nim @@ -9,7 +9,7 @@ import private/datatypes -{.push raises: [IndexError], noInit, gcsafe.} +{.push raises: [IndexDefect], noInit, gcsafe.} # Serialization # ------------------------------------------------------------------------------------------ @@ -138,7 +138,7 @@ func toBytes*[bits: static int](x: StUint[bits], endian: Endianness = bigEndian) func fromBytesBE*[bits: static int]( T: typedesc[StUint[bits]], - x: openArray[byte]): T = + x: openArray[byte]): T {.raises: [], noInit, gcsafe.} = ## Read big endian bytes and convert to an integer. At runtime, v must contain ## at least sizeof(T) bytes. Native endianess is used which is not ## portable! (i.e. use fixed-endian byte array or hex for serialization) diff --git a/stint/intops.nim b/stint/intops.nim index fa97d52..90ed0d2 100644 --- a/stint/intops.nim +++ b/stint/intops.nim @@ -7,19 +7,18 @@ # # at your option. This file may not be copied, modified, or distributed except according to those terms. 
-import ./private/[bitops2_priv, datatypes] +import ./private/[datatypes] -export StInt, StUint -export IntImpl, intImpl, UintImpl, uintImpl, bitsof # TODO: remove the need to export those +export StInt +#export IntImpl, intImpl, UintImpl, uintImpl, bitsof # TODO: remove the need to export those -type SomeBigInteger = StUint|StInt +#import ./private/initialization -import ./private/initialization - -func zero*[bits: static[int]](T: typedesc[StUint[bits] or StInt[bits]]): T {.inline.} = +func zero*[bits: static[int]](T: typedesc[StInt[bits]]): T {.inline.} = ## Returns the zero of the input type discard - + +#[ func one*[bits: static[int]](T: typedesc[StUint[bits]]): T {.inline.} = ## Returns the one of the input type result.data = one(type result.data) @@ -159,3 +158,4 @@ func pow*(x: StUint, y: StUint): StUint {.inline.} = result.data = x.data.pow(y.data) else: result.data = x.data ^ y.data +]# \ No newline at end of file diff --git a/stint/io.nim b/stint/io.nim index c80fc4f..c7ddd17 100644 --- a/stint/io.nim +++ b/stint/io.nim @@ -1,5 +1,5 @@ # Stint -# Copyright 2018 Status Research & Development GmbH +# Copyright 2018-2023 Status Research & Development GmbH # Licensed under either of # # * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) @@ -21,6 +21,18 @@ import from stew/byteutils import toHex # Why are we exporting readHexChar in byteutils? +template leastSignificantWord*(a: SomeBigInteger): Word = + a.limbs[0] + +template mostSignificantWord*(a: SomeBigInteger): Word = + a.limbs[^1] + +template signedWordType*(_: type SomeBigInteger): type = + SignedWord + +template wordType*(_: type SomeBigInteger): type = + Word + template static_check_size(T: typedesc[SomeInteger], bits: static[int]) = # To avoid a costly runtime check, we refuse storing into StUint types smaller # than the input type. 
@@ -62,14 +74,14 @@ func stuint*[T: SomeInteger](n: T, bits: static[int]): StUint[bits] {.inline.}= func to*(a: SomeUnsignedInt, T: typedesc[StUint]): T = stuint(a, result.bits) -func truncate*(num: StInt or StUint, T: typedesc[SomeInteger]): T {.inline.}= +func truncate*(num: Stint or StUint, T: typedesc[SomeInteger]): T {.inline.}= ## Extract the int, uint, int8-int64 or uint8-uint64 portion of a multi-precision integer. ## Note that int and uint are 32-bit on 32-bit platform. ## For unsigned result type, result is modulo 2^(sizeof T in bit) ## For signed result type, result is undefined if input does not fit in the target type. result = T(num.leastSignificantWord()) -func toInt*(num: StInt or StUint): int {.inline, deprecated:"Use num.truncate(int) instead".}= +func toInt*(num: Stint or StUint): int {.inline, deprecated:"Use num.truncate(int) instead".}= num.truncate(int) func stuint*(a: StUint, bits: static[int]): StUint[bits] {.inline.} = @@ -79,7 +91,7 @@ func stuint*(a: StUint, bits: static[int]): StUint[bits] {.inline.} = for i in 0 ..< result.len: result[i] = a[i] -# func stuint*(a: StInt, bits: static[int]): StUint[bits] {.inline.} = +# func StUint*(a: StInt, bits: static[int]): StUint[bits] {.inline.} = # ## signed int to unsigned int conversion # ## current behavior is cast-like, copying bit pattern # ## or truncating if input does not fit into destination @@ -87,12 +99,12 @@ func stuint*(a: StUint, bits: static[int]): StUint[bits] {.inline.} = # when N < bits: # when N <= 64: # type T = StUint[N] -# result = stuint(convert[T](a).data, bits) +# result = StUint(convert[T](a).data, bits) # else: # smallToBig(result.data, a.data) # elif N > bits: # when bits <= 64: -# result = stuint(x.truncate(type(result.data)), bits) +# result = StUint(x.truncate(type(result.data)), bits) # else: # bigToSmall(result.data, a.data) # else: @@ -143,7 +155,7 @@ func stuint*(a: StUint, bits: static[int]): StUint[bits] {.inline.} = # func stint*(a: StUint, bits: static[int]): 
StInt[bits] {.inline.} = # const N = bitsof(a.data) -# const dmax = stuint((type result).high, N) +# const dmax = StUint((type result).high, N) # if a > dmax: raise newException(RangeError, "value out of range") # when N < bits: # when N <= 64: @@ -170,8 +182,6 @@ func readHexChar(c: char): int8 {.inline.}= func skipPrefixes(current_idx: var int, str: string, radix: range[2..16]) {.inline.} = ## Returns the index of the first meaningful char in `hexStr` by skipping ## "0x" prefix - # Always called from a context where radix is known at compile-time - # and checked within 2..16 and so cannot throw a RangeDefect at runtime if str.len < 2: return @@ -179,20 +189,14 @@ func skipPrefixes(current_idx: var int, str: string, radix: range[2..16]) {.inli doAssert current_idx == 0, "skipPrefixes only works for prefixes (position 0 and 1 of the string)" if str[0] == '0': if str[1] in {'x', 'X'}: - if radix == 16: - current_idx = 2 - else: - raise newException(ValueError,"Parsing mismatch, 0x prefix is only valid for a hexadecimal number (base 16)") + doAssert radix == 16, "Parsing mismatch, 0x prefix is only valid for a hexadecimal number (base 16)" + current_idx = 2 elif str[1] in {'o', 'O'}: - if radix == 8: - current_idx = 2 - else: - raise newException(ValueError, "Parsing mismatch, 0o prefix is only valid for an octal number (base 8)") + doAssert radix == 8, "Parsing mismatch, 0o prefix is only valid for an octal number (base 8)" + current_idx = 2 elif str[1] in {'b', 'B'}: - if radix == 2: - current_idx = 2 - elif radix != 16: - raise newException(ValueError, "Parsing mismatch, 0b prefix is only valid for a binary number (base 2) or as first byte of a hexadecimal number (base 16)") + doAssert radix == 2, "Parsing mismatch, 0b prefix is only valid for a binary number (base 2)" + current_idx = 2 func nextNonBlank(current_idx: var int, s: string) {.inline.} = ## Move the current index, skipping white spaces and "_" characters. 
@@ -203,15 +207,13 @@ func nextNonBlank(current_idx: var int, s: string) {.inline.} = while current_idx < s.len and s[current_idx] in blanks: inc current_idx -func readDecChar(c: char): int {.inline.}= +func readDecChar(c: range['0'..'9']): int {.inline.}= ## Converts a decimal char to an int # specialization without branching for base <= 10. - if c notin {'0'..'9'}: - raise newException(ValueError, "Character out of '0'..'9' range") ord(c) - ord('0') func parse*[bits: static[int]](input: string, T: typedesc[StUint[bits]], radix: static[uint8] = 10): T = - ## Parse a string and store the result in a StInt[bits] or StUint[bits]. + ## Parse a string and store the result in a Stint[bits] or StUint[bits]. static: doAssert (radix >= 2) and radix <= 16, "Only base from 2..16 are supported" # TODO: use static[range[2 .. 16]], not supported at the moment (2018-04-26) @@ -232,7 +234,7 @@ func parse*[bits: static[int]](input: string, T: typedesc[StUint[bits]], radix: nextNonBlank(curr, input) # func parse*[bits: static[int]](input: string, T: typedesc[Stint[bits]], radix: static[int8] = 10): T = -# ## Parse a string and store the result in a Stint[bits] or Stuint[bits]. +# ## Parse a string and store the result in a Stint[bits] or StUint[bits]. # static: doAssert (radix >= 2) and radix <= 16, "Only base from 2..16 are supported" # # TODO: use static[range[2 .. 16]], not supported at the moment (2018-04-26) @@ -241,12 +243,12 @@ func parse*[bits: static[int]](input: string, T: typedesc[StUint[bits]], radix: # # and be much faster # # For conversion we require overflowing operations (for example for negative hex numbers) -# const base = radix.int8.stuint(bits) +# const base = radix.int8.StUint(bits) # var # curr = 0 # Current index in the string # isNeg = false -# no_overflow: Stuint[bits] +# no_overflow: StUint[bits] # if input[curr] == '-': # doAssert radix == 10, "Negative numbers are only supported with base 10 input." 
@@ -258,9 +260,9 @@ func parse*[bits: static[int]](input: string, T: typedesc[StUint[bits]], radix: # while curr < input.len: # # TODO: overflow detection # when radix <= 10: -# no_overflow = no_overflow * base + input[curr].readDecChar.stuint(bits) +# no_overflow = no_overflow * base + input[curr].readDecChar.StUint(bits) # else: -# no_overflow = no_overflow * base + input[curr].readHexChar.stuint(bits) +# no_overflow = no_overflow * base + input[curr].readHexChar.StUint(bits) # nextNonBlank(curr, input) # # TODO: we can't create the lowest int this way @@ -269,7 +271,7 @@ func parse*[bits: static[int]](input: string, T: typedesc[StUint[bits]], radix: # else: # result = convert[T](no_overflow) -func fromHex*(T: typedesc[StUint|StInt], s: string): T {.inline.} = +func fromHex*(T: typedesc[StUint|Stint], s: string): T {.inline.} = ## Convert an hex string to the corresponding unsigned integer parse(s, type result, radix = 16) @@ -277,34 +279,34 @@ func hexToUint*[bits: static[int]](hexString: string): StUint[bits] {.inline.} = ## Convert an hex string to the corresponding unsigned integer parse(hexString, type result, radix = 16) -# func toString*[bits: static[int]](num: StUint[bits], radix: static[uint8] = 10): string = -# ## Convert a Stint or Stuint to string. -# ## In case of negative numbers: -# ## - they are prefixed with "-" for base 10. -# ## - if not base 10, they are returned raw in two-complement form. +func toString*[bits: static[int]](num: StUint[bits], radix: static[uint8] = 10): string = + ## Convert a Stint or StUint to string. + ## In case of negative numbers: + ## - they are prefixed with "-" for base 10. + ## - if not base 10, they are returned raw in two-complement form. -# static: doAssert (radix >= 2) and radix <= 16, "Only base from 2..16 are supported" -# # TODO: use static[range[2 .. 
16]], not supported at the moment (2018-04-26) + static: doAssert (radix >= 2) and radix <= 16, "Only base from 2..16 are supported" + # TODO: use static[range[2 .. 16]], not supported at the moment (2018-04-26) -# const hexChars = "0123456789abcdef" -# const base = radix.uint8.stuint(bits) + const hexChars = "0123456789abcdef" + const base = radix.uint8.stuint(bits) -# result = "" -# var (q, r) = divmod(num, base) + result = "" + var (q, r) = divmod(num, base) -# while true: -# when bitsof(r.data) <= 64: -# result.add hexChars[r.data.int] -# else: -# result.add hexChars[r.truncate(int)] -# if q.isZero: -# break -# (q, r) = divmod(q, base) + while true: + when bits <= 64: + result.add hexChars[r.leastSignificantWord()] + else: + result.add hexChars[r.truncate(int)] + if q.isZero: + break + (q, r) = divmod(q, base) -# reverse(result) + reverse(result) # func toString*[bits: static[int]](num: Stint[bits], radix: static[int8] = 10): string = -# ## Convert a Stint or Stuint to string. +# ## Convert a Stint or StUint to string. # ## In case of negative numbers: # ## - they are prefixed with "-" for base 10. # ## - if not base 10, they are returned raw in two-complement form. @@ -313,11 +315,11 @@ func hexToUint*[bits: static[int]](hexString: string): StUint[bits] {.inline.} = # # TODO: use static[range[2 .. 16]], not supported at the moment (2018-04-26) # const hexChars = "0123456789abcdef" -# const base = radix.int8.stuint(bits) +# const base = radix.int8.StUint(bits) # result = "" -# type T = Stuint[bits] +# type T = StUint[bits] # let isNeg = num.isNegative # let num = convert[T](if radix == 10 and isNeg: -num # else: num) @@ -344,11 +346,11 @@ func hexToUint*[bits: static[int]](hexString: string): StUint[bits] {.inline.} = # else: # toString(num, 10) -# func toHex*[bits: static[int]](num: Stint[bits] or StUint[bits]): string {.inline.}= -# ## Convert to a hex string. -# ## Output is considered a big-endian base 16 string. -# ## Leading zeros are stripped. 
Use dumpHex instead if you need the in-memory representation -# toString(num, 16) +func toHex*[bits: static[int]](num: Stint[bits] or StUint[bits]): string {.inline.}= + ## Convert to a hex string. + ## Output is considered a big-endian base 16 string. + ## Leading zeros are stripped. Use dumpHex instead if you need the in-memory representation + toString(num, 16) func dumpHex*(a: Stint or StUint, order: static[Endianness] = bigEndian): string = ## Stringify an int to hex. @@ -365,7 +367,9 @@ func dumpHex*(a: Stint or StUint, order: static[Endianness] = bigEndian): string let bytes = a.toBytes(order) result = bytes.toHex() -func readUintBE*[bits: static[int]](ba: openarray[byte]): Stuint[bits] {.noInit, inline.}= +export fromBytes, toBytes + +func readUintBE*[bits: static[int]](ba: openArray[byte]): StUint[bits] {.noInit, inline.}= ## Convert a big-endian array of (bits div 8) Bytes to an UInt[bits] (in native host endianness) ## Input: ## - a big-endian openArray of size (bits div 8) at least @@ -386,3 +390,11 @@ template hash*(num: StUint|StInt): Hash = # `hashData` is not particularly efficient. # Explore better hashing solutions in nim-stew. 
hashData(unsafeAddr num, sizeof num) + +func fromBytesBE*(T: type StUint, ba: openArray[byte], allowPadding: static[bool] = true): T {.noInit, inline.}= + result = readUintBE[T.bits](ba) + when allowPadding: + result = result shl ((sizeof(T) - ba.len) * 8) + +template initFromBytesBE*(x: var StUint, ba: openArray[byte], allowPadding: static[bool] = true) = + x = fromBytesBE(type x, ba, allowPadding) diff --git a/stint/literals_stint.nim b/stint/literals_stint.nim index e09e1f8..40a8431 100644 --- a/stint/literals_stint.nim +++ b/stint/literals_stint.nim @@ -9,7 +9,7 @@ ## This file provides syntactic sugar to work with literals -import ./intops, macros +import ./intops, ./uintops, macros type Signedness = enum BothSigned, IntOnly, UintOnly diff --git a/stint/modular_arithmetic.nim b/stint/modular_arithmetic.nim index 4eff437..d7f2914 100644 --- a/stint/modular_arithmetic.nim +++ b/stint/modular_arithmetic.nim @@ -7,7 +7,7 @@ # # at your option. This file may not be copied, modified, or distributed except according to those terms. 
-import ./intops, private/datatypes +import ./uintops, private/datatypes func addmod_internal(a, b, m: StUint): StUint {.inline.}= ## Modular addition diff --git a/stint/private/datatypes.nim b/stint/private/datatypes.nim index 1792304..5c04439 100644 --- a/stint/private/datatypes.nim +++ b/stint/private/datatypes.nim @@ -12,9 +12,13 @@ import stew/bitops2 when sizeof(int) == 8 and not defined(Stint32): - type Word* = uint64 + type + Word* = uint64 + SignedWord* = int64 else: - type Word* = uint32 + type + Word* = uint32 + SignedWord* = int32 const WordBitWidth* = sizeof(Word) * 8 @@ -33,10 +37,19 @@ type limbs*: array[bits.wordsRequired, Word] # Limbs-Endianess is little-endian - StInt*[bits: static[int]] {.borrow: `.`.} = distinct StUint[bits] - ## Stack-based integer - ## Signed +when (NimMajor, NimMinor) < (1,9): + type + StInt*[bits: static[int]] = object + ## Stack-based integer + ## Signed + limbs*: array[bits.wordsRequired, Word] +else: + type + StInt*[bits: static[int]] {.borrow: `.`.} = distinct StUint[bits] + ## Stack-based integer + ## Signed +type Carry* = uint8 # distinct range[0'u8 .. 1] Borrow* = uint8 # distinct range[0'u8 .. 1] @@ -136,4 +149,4 @@ func copyWords*( for i in countdown(numWords-1, 0): a[startA+i] = b[startB+i] -{.pop.} \ No newline at end of file +{.pop.} diff --git a/stint/private/uint_bitwise.nim b/stint/private/uint_bitwise.nim index a3ce42b..598d88d 100644 --- a/stint/private/uint_bitwise.nim +++ b/stint/private/uint_bitwise.nim @@ -20,7 +20,7 @@ import func bitnot*(r: var StUint, a: Stuint) = ## Bitwise complement of unsigned integer a ## i.e. 
flips all bits of the input - for i in 0 ..< r.len: + for i in 0 ..< r.limbs.len: r[i] = not a[i] r.clearExtraBitsOverMSB() @@ -56,7 +56,7 @@ func leadingZeros*(a: Stuint): int = # Adjust when we use only part of the word size var extraBits = WordBitWidth * a.limbs.len - a.bits - for i in countdown(a.len-1, 0): + for i in countdown(a.limbs.len-1, 0): let zeroCount = a.limbs[i].leadingZeros() if extraBits > 0: result += zeroCount - min(extraBits, WordBitWidth) diff --git a/stint/private/uint_div.nim b/stint/private/uint_div.nim index 960d02e..c0a0798 100644 --- a/stint/private/uint_div.nim +++ b/stint/private/uint_div.nim @@ -54,146 +54,6 @@ func shlAddMod_multi(a: var openArray[Word], c: Word, ## Does a <- a * W + c (mod M) ## and returns q = (a * W + c ) / M ## -<<<<<<< HEAD - ## For now only LittleEndian is implemented - # - # Resources at the bottom of the file - - const - qLen = q.limbs.len - rLen = r.limbs.len - uLen = u.limbs.len - vLen = v.limbs.len - - template `[]`(a: Stuint, i: int): Word = a.limbs[i] - template `[]=`(a: Stuint, i: int, val: Word) = a.limbs[i] = val - - # Find the most significant word with actual set bits - # and get the leading zero count there - var divisorLen = vLen - var clz: int - for w in mostToLeastSig(v): - if w != 0: - clz = leadingZeros(w) - break - else: - divisorLen -= 1 - - doAssert divisorLen != 0, "Division by zero. Abandon ship!" - - # Divisor is a single word. - if divisorLen == 1: - q.copyFrom(u) - r.leastSignificantWord() = q.limbs.shortDiv(v.leastSignificantWord()) - # zero all but the least significant word - var lsw = true - for w in leastToMostSig(r): - if lsw: - lsw = false - else: - w = 0 - return - - var un {.noInit.}: Limbs[uLen+1] - var vn {.noInit.}: Limbs[vLen] # [mswLen .. 
vLen] range is unused - - # Normalize so that the divisor MSB is set, - # vn cannot overflow, un can overflowed by 1 word at most, hence uLen+1 - un.shlSmallOverflowing(u.limbs, clz) - vn.shlSmall(v.limbs, clz) - - static: doAssert cpuEndian == littleEndian, "Currently the division algorithm requires little endian ordering of the limbs" - # TODO: is it worth it to have the uint be the exact same extended precision representation - # as a wide int (say uint128 or uint256)? - # in big-endian, the following loop must go the other way and the -1 must be +1 - - let vhi = vn[divisorLen-1] - let vlo = vn[divisorLen-2] - - for j in countdown(uLen - divisorLen, 0, 1): - # Compute qhat estimate of q[j] (off by 0, 1 and rarely 2) - var qhat, rhat: Word - let uhi = un[j+divisorLen] - let ulo = un[j+divisorLen-1] - div2n1n(qhat, rhat, uhi, ulo, vhi) - var mhi, mlo: Word - var rhi, rlo: Word - mul(mhi, mlo, qhat, vlo) - rhi = rhat - rlo = ulo - - # if r < m, adjust approximation, up to twice - while rhi < mhi or (rhi == mhi and rlo < mlo): - qhat -= 1 - rhi += vhi - - # Found the quotient - q[j] = qhat - - # un -= qhat * v - var borrow = Borrow(0) - var qvhi, qvlo: Word - for i in 0 ..< divisorLen-1: - mul(qvhi, qvlo, qhat, v[i]) - subB(borrow, un[j+i], un[j+i], qvlo, borrow) - subB(borrow, un[j+i+1], un[j+i+1], qvhi, borrow) - # Last step - mul(qvhi, qvlo, qhat, v[divisorLen-1]) - subB(borrow, un[j+divisorLen-1], un[j+divisorLen-1], qvlo, borrow) - qvhi += Word(borrow) - let isNeg = un[j+divisorLen] < qvhi - un[j+divisorLen] -= qvhi - - if isNeg: - # oops, too big by one, add back - q[j] -= 1 - var carry = Carry(0) - for i in 0 ..< divisorLen: - addC(carry, un[j+i], un[j+i], v[i], carry) - - # Quotient is found, if remainder is needed we need to un-normalize un - if needRemainder: - # r.limbs.shrSmall(un, clz) - TODO - when cpuEndian == littleEndian: - # rLen+1 == un.len - for i in 0 ..< rLen: - r[i] = (un[i] shr clz) or (un[i+1] shl (WordBitWidth - clz)) - else: - {.error: 
"Not Implemented for bigEndian".} - - -const BinaryShiftThreshold = 8 # If the difference in bit-length is below 8 - # binary shift is probably faster - -func divmod(q, r: var Stuint, - x, y: Stuint, needRemainder: bool) = - - let x_clz = x.leadingZeros() - let y_clz = y.leadingZeros() - - # We short-circuit division depending on special-cases. - if unlikely(y.isZero()): - raise newException(DivByZeroError, "You attempted to divide by zero") - elif y_clz == (y.bits - 1): - # y is one - q = x - # elif (x.hi or y.hi).isZero: - # # If computing just on the low part is enough - # (result.quot.lo, result.rem.lo) = divmod(x.lo, y.lo, needRemainder) - # elif (y and (y - one(type y))).isZero: - # # y is a power of 2. (this also matches 0 but it was eliminated earlier) - # # TODO. Would it be faster to use countTrailingZero (ctz) + clz == size(y) - 1? - # # Especially because we shift by ctz after. - # let y_ctz = bitsof(y) - y_clz - 1 - # result.quot = x shr y_ctz - # if needRemainder: - # result.rem = x and (y - one(type y)) - elif x == y: - q.setOne() - elif x < y: - r = x - # elif (y_clz - x_clz) < BinaryShiftThreshold: - # binaryShiftDiv(x, y, result.quot, result.rem) ## The modulus `M` most-significant bit at `mBits` MUST be set. 
# Assuming 64-bit words @@ -369,7 +229,7 @@ func divRem*( # - An Efficient Multiple-Precision Division Algorithm, # Liusheng Huang, Hong Zhong, Hong Shen, Yonglong Luo, 2005 # https://ieeexplore.ieee.org/document/1579076 -# +# # - Efficient multiple-precision integer division algorithm # Debapriyay Mukhopadhyaya, Subhas C.Nandy, 2014 # https://www.sciencedirect.com/science/article/abs/pii/S0020019013002627 diff --git a/stint/uintops.nim b/stint/uintops.nim index 681616f..3c406ef 100644 --- a/stint/uintops.nim +++ b/stint/uintops.nim @@ -46,7 +46,7 @@ func one*[bits: static[int]](T: typedesc[Stuint[bits]]): T {.inline.} = result.setOne() func high*[bits](_: typedesc[Stuint[bits]]): Stuint[bits] {.inline.} = - for i in 0 ..< result.len: + for i in 0 ..< result.limbs.len: result[i] = high(Word) func low*[bits](_: typedesc[Stuint[bits]]): Stuint[bits] {.inline.} = From 0dc6afe9d431659da813b25b4ad1332b7fcef36e Mon Sep 17 00:00:00 2001 From: jangko Date: Mon, 12 Jun 2023 21:07:15 +0700 Subject: [PATCH 23/26] let the tests compileable and run --- stint.nim | 4 +- stint.nimble | 6 +- stint/endians2.nim | 6 +- stint/io.nim | 16 ++-- stint/private/datatypes.nim | 6 +- .../primitives/compiletime_fallback.nim | 6 +- .../private/primitives/extended_precision.nim | 2 +- .../extended_precision_64bit_uint128.nim | 8 +- stint/private/uint_addsub.nim | 16 ++-- stint/private/uint_bitwise.nim | 20 ++--- stint/private/uint_shift.nim | 8 +- stint/uintops.nim | 68 ++++++++--------- tests/all_tests.nim | 24 +++--- tests/test_uint_addsub.nim | 18 +++-- tests/test_uint_bitwise.nim | 26 ++++--- tests/test_uint_comparison.nim | 75 ++++++++++--------- tests/test_uint_divmod.nim | 14 ++-- tests/test_uint_endians2.nim | 38 +++++----- tests/test_uint_exp.nim | 10 ++- tests/test_uint_modular_arithmetic.nim | 18 +++-- tests/test_uint_mul.nim | 8 +- 21 files changed, 206 insertions(+), 191 deletions(-) diff --git a/stint.nim b/stint.nim index efd52f3..64f5a42 100644 --- a/stint.nim +++ b/stint.nim 
@@ -14,8 +14,8 @@ import stint/[io, uintops, intops, literals_stint, modular_arithmetic] export io, uintops, intops, literals_stint, modular_arithmetic type - Int128* = Stint[128] - Int256* = Stint[256] + Int128* = StInt[128] + Int256* = StInt[256] UInt128* = StUint[128] UInt256* = StUint[256] diff --git a/stint.nimble b/stint.nimble index e19546c..b7fb0a5 100644 --- a/stint.nimble +++ b/stint.nimble @@ -10,7 +10,7 @@ skipDirs = @["tests", "benchmarks"] requires "nim >= 1.6.12", "stew" -proc test(name: string, lang: string = "c") = +proc test(args, path: string) = if not dirExists "build": mkDir "build" @@ -23,10 +23,10 @@ proc test(name: string, lang: string = "c") = " --styleCheck:usages --styleCheck:error " & path task test_internal, "Run tests for internal procs": - test "internal" + test "", "tests/internal" task test_public_api, "Run all tests - prod implementation (StUint[64] = uint64": - test "all_tests" + test "", "tests/all_tests" task test_uint256_ttmath, "Run random tests Uint256 vs TTMath": requires "https://github.com/alehander42/nim-quicktest >= 0.18.0", "https://github.com/status-im/nim-ttmath" diff --git a/stint/endians2.nim b/stint/endians2.nim index 3614940..6bc2990 100644 --- a/stint/endians2.nim +++ b/stint/endians2.nim @@ -9,7 +9,7 @@ import private/datatypes -{.push raises: [IndexDefect], noInit, gcsafe.} +{.push raises: [IndexDefect], noinit, gcsafe.} # Serialization # ------------------------------------------------------------------------------------------ @@ -138,7 +138,7 @@ func toBytes*[bits: static int](x: StUint[bits], endian: Endianness = bigEndian) func fromBytesBE*[bits: static int]( T: typedesc[StUint[bits]], - x: openArray[byte]): T {.raises: [], noInit, gcsafe.} = + x: openArray[byte]): T {.raises: [], noinit, gcsafe.} = ## Read big endian bytes and convert to an integer. At runtime, v must contain ## at least sizeof(T) bytes. Native endianess is used which is not ## portable! (i.e. 
use fixed-endian byte array or hex for serialization) @@ -238,7 +238,7 @@ func fromBytesLE*[bits: static int]( func fromBytes*[bits: static int]( T: typedesc[StUint[bits]], - x: openarray[byte], + x: openArray[byte], srcEndian: Endianness = bigEndian): T {.inline.} = ## Read an source bytearray with the specified endianness and ## convert it to an integer diff --git a/stint/io.nim b/stint/io.nim index c7ddd17..6120ab4 100644 --- a/stint/io.nim +++ b/stint/io.nim @@ -74,14 +74,14 @@ func stuint*[T: SomeInteger](n: T, bits: static[int]): StUint[bits] {.inline.}= func to*(a: SomeUnsignedInt, T: typedesc[StUint]): T = stuint(a, result.bits) -func truncate*(num: Stint or StUint, T: typedesc[SomeInteger]): T {.inline.}= +func truncate*(num: StInt or StUint, T: typedesc[SomeInteger]): T {.inline.}= ## Extract the int, uint, int8-int64 or uint8-uint64 portion of a multi-precision integer. ## Note that int and uint are 32-bit on 32-bit platform. ## For unsigned result type, result is modulo 2^(sizeof T in bit) ## For signed result type, result is undefined if input does not fit in the target type. 
result = T(num.leastSignificantWord()) -func toInt*(num: Stint or StUint): int {.inline, deprecated:"Use num.truncate(int) instead".}= +func toInt*(num: StInt or StUint): int {.inline, deprecated:"Use num.truncate(int) instead".}= num.truncate(int) func stuint*(a: StUint, bits: static[int]): StUint[bits] {.inline.} = @@ -271,7 +271,7 @@ func parse*[bits: static[int]](input: string, T: typedesc[StUint[bits]], radix: # else: # result = convert[T](no_overflow) -func fromHex*(T: typedesc[StUint|Stint], s: string): T {.inline.} = +func fromHex*(T: typedesc[StUint|StInt], s: string): T {.inline.} = ## Convert an hex string to the corresponding unsigned integer parse(s, type result, radix = 16) @@ -346,13 +346,13 @@ func toString*[bits: static[int]](num: StUint[bits], radix: static[uint8] = 10): # else: # toString(num, 10) -func toHex*[bits: static[int]](num: Stint[bits] or StUint[bits]): string {.inline.}= +func toHex*[bits: static[int]](num: StInt[bits] or StUint[bits]): string {.inline.}= ## Convert to a hex string. ## Output is considered a big-endian base 16 string. ## Leading zeros are stripped. Use dumpHex instead if you need the in-memory representation toString(num, 16) -func dumpHex*(a: Stint or StUint, order: static[Endianness] = bigEndian): string = +func dumpHex*(a: StInt or StUint, order: static[Endianness] = bigEndian): string = ## Stringify an int to hex. ## Note. Leading zeros are not removed. Use toString(n, base = 16)/toHex instead. 
## @@ -369,7 +369,7 @@ func dumpHex*(a: Stint or StUint, order: static[Endianness] = bigEndian): string export fromBytes, toBytes -func readUintBE*[bits: static[int]](ba: openArray[byte]): StUint[bits] {.noInit, inline.}= +func readUintBE*[bits: static[int]](ba: openArray[byte]): StUint[bits] {.noinit, inline.}= ## Convert a big-endian array of (bits div 8) Bytes to an UInt[bits] (in native host endianness) ## Input: ## - a big-endian openArray of size (bits div 8) at least @@ -377,7 +377,7 @@ func readUintBE*[bits: static[int]](ba: openArray[byte]): StUint[bits] {.noInit, ## - A unsigned integer of the same size with `bits` bits result = (typeof result).fromBytesBE(ba) -func toByteArrayBE*[bits: static[int]](n: StUint[bits]): array[bits div 8, byte] {.noInit, inline.}= +func toByteArrayBE*[bits: static[int]](n: StUint[bits]): array[bits div 8, byte] {.noinit, inline.}= ## Convert a uint[bits] to to a big-endian array of bits div 8 bytes ## Input: ## - an unsigned integer @@ -391,7 +391,7 @@ template hash*(num: StUint|StInt): Hash = # Explore better hashing solutions in nim-stew. hashData(unsafeAddr num, sizeof num) -func fromBytesBE*(T: type StUint, ba: openArray[byte], allowPadding: static[bool] = true): T {.noInit, inline.}= +func fromBytesBE*(T: type StUint, ba: openArray[byte], allowPadding: static[bool] = true): T {.noinit, inline.}= result = readUintBE[T.bits](ba) when allowPadding: result = result shl ((sizeof(T) - ba.len) * 8) diff --git a/stint/private/datatypes.nim b/stint/private/datatypes.nim index 5c04439..47bbf23 100644 --- a/stint/private/datatypes.nim +++ b/stint/private/datatypes.nim @@ -53,7 +53,7 @@ type Carry* = uint8 # distinct range[0'u8 .. 1] Borrow* = uint8 # distinct range[0'u8 .. 
1] - SomeBigInteger*[bits: static[int]] = Stuint[bits]|Stint[bits] + SomeBigInteger*[bits: static[int]] = StUint[bits] | StInt[bits] const GCC_Compatible* = defined(gcc) or defined(clang) or defined(llvm_gcc) const X86* = defined(amd64) or defined(i386) @@ -65,7 +65,7 @@ when sizeof(int) == 8 and GCC_Compatible: # Bithacks # -------------------------------------------------------- -{.push raises: [], inline, noInit, gcsafe.} +{.push raises: [], inline, noinit, gcsafe.} template clearExtraBitsOverMSB*(a: var StUint) = ## A Stuint is stored in an array of 32 of 64-bit word @@ -138,7 +138,7 @@ macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped # Copy # -------------------------------------------------------- -{.push raises: [], inline, noInit, gcsafe.} +{.push raises: [], inline, noinit, gcsafe.} func copyWords*( a: var openArray[Word], startA: int, diff --git a/stint/private/primitives/compiletime_fallback.nim b/stint/private/primitives/compiletime_fallback.nim index 051cf86..daf65c7 100644 --- a/stint/private/primitives/compiletime_fallback.nim +++ b/stint/private/primitives/compiletime_fallback.nim @@ -109,7 +109,7 @@ func muladd2_nim*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}= addC_nim(carry2, hi, hi, 0, carry2) -func div2n1n_nim*[T: SomeunsignedInt](q, r: var T, n_hi, n_lo, d: T) = +func div2n1n_nim*[T: SomeUnsignedInt](q, r: var T, n_hi, n_lo, d: T) = ## Division uint128 by uint64 ## Warning ⚠️ : ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE @@ -141,8 +141,8 @@ func div2n1n_nim*[T: SomeunsignedInt](q, r: var T, n_hi, n_lo, d: T) = let d_hi = d shr halfSize d_lo = d and halfMask - n_lohi = nlo shr halfSize - n_lolo = nlo and halfMask + n_lohi = n_lo shr halfSize + n_lolo = n_lo and halfMask # First half of the quotient let (q1, r1) = halfQR(n_hi, n_lohi, d, d_hi, d_lo) diff --git a/stint/private/primitives/extended_precision.nim b/stint/private/primitives/extended_precision.nim index 
9d795fd..cd04828 100644 --- a/stint/private/primitives/extended_precision.nim +++ b/stint/private/primitives/extended_precision.nim @@ -77,7 +77,7 @@ when sizeof(int) == 8 and not defined(Stint32): when defined(vcc): from ./extended_precision_x86_64_msvc import div2n1n_128, mul_128, muladd1_128, muladd2_128 - elif GCCCompatible: + elif GCC_Compatible: when X86: from ./extended_precision_x86_64_gcc import div2n1n_128 from ./extended_precision_64bit_uint128 import mul_128, muladd1_128, muladd2_128 diff --git a/stint/private/primitives/extended_precision_64bit_uint128.nim b/stint/private/primitives/extended_precision_64bit_uint128.nim index 321e4da..3289ce0 100644 --- a/stint/private/primitives/extended_precision_64bit_uint128.nim +++ b/stint/private/primitives/extended_precision_64bit_uint128.nim @@ -24,7 +24,7 @@ func div2n1n_128*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}= ## Warning ⚠️ : ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE on some platforms ## - if n_hi > d result is undefined - var dblPrec {.noInit.}: uint128 + var dblPrec {.noinit.}: uint128 {.emit:[dblPrec, " = (unsigned __int128)", n_hi," << 64 | (unsigned __int128)",n_lo,";"].} # Don't forget to dereference the var param in C mode @@ -39,7 +39,7 @@ func mul_128*(hi, lo: var uint64, a, b: uint64) {.inline.} = ## Extended precision multiplication ## (hi, lo) <- a*b block: - var dblPrec {.noInit.}: uint128 + var dblPrec {.noinit.}: uint128 {.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b,";"].} # Don't forget to dereference the var param in C mode @@ -60,7 +60,7 @@ func muladd1_128*(hi, lo: var uint64, a, b, c: uint64) {.inline.} = ## This is constant-time on most hardware ## See: https://www.bearssl.org/ctmul.html block: - var dblPrec {.noInit.}: uint128 + var dblPrec {.noinit.}: uint128 {.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, " + (unsigned __int128)",c,";"].} # Don't forget to dereference the var param in 
C mode @@ -80,7 +80,7 @@ func muladd2_128*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}= ## so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000) ## and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing block: - var dblPrec {.noInit.}: uint128 + var dblPrec {.noinit.}: uint128 {.emit:[ dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, " + (unsigned __int128)",c1," + (unsigned __int128)",c2,";" diff --git a/stint/private/uint_addsub.nim b/stint/private/uint_addsub.nim index 3cd4909..f5e4882 100644 --- a/stint/private/uint_addsub.nim +++ b/stint/private/uint_addsub.nim @@ -14,44 +14,44 @@ import # Addsub # -------------------------------------------------------- -{.push raises: [], inline, noInit, gcsafe.} +{.push raises: [], inline, noinit, gcsafe.} -func sum*(r: var Stuint, a, b: Stuint) = +func sum*(r: var StUint, a, b: StUint) = ## Addition for multi-precision unsigned int var carry = Carry(0) for i in 0 ..< r.limbs.len: addC(carry, r[i], a[i], b[i], carry) r.clearExtraBitsOverMSB() -func `+=`*(a: var Stuint, b: Stuint) = +func `+=`*(a: var StUint, b: StUint) = ## In-place addition for multi-precision unsigned int a.sum(a, b) -func diff*(r: var Stuint, a, b: Stuint) = +func diff*(r: var StUint, a, b: StUint) = ## Substraction for multi-precision unsigned int var borrow = Borrow(0) for i in 0 ..< r.limbs.len: subB(borrow, r[i], a[i], b[i], borrow) r.clearExtraBitsOverMSB() -func `-=`*(a: var Stuint, b: Stuint) = +func `-=`*(a: var StUint, b: StUint) = ## In-place substraction for multi-precision unsigned int a.diff(a, b) -func inc*(a: var Stuint, w: Word = 1) = +func inc*(a: var StUint, w: Word = 1) = var carry = Carry(0) addC(carry, a.limbs[0], a.limbs[0], w, carry) for i in 1 ..< a.limbs.len: addC(carry, a.limbs[i], a.limbs[i], 0, carry) a.clearExtraBitsOverMSB() -func sum*(r: var Stuint, a: Stuint, b: SomeUnsignedInt) = +func sum*(r: var StUint, a: StUint, b: SomeUnsignedInt) = ## 
Addition for multi-precision unsigned int ## with an unsigned integer r = a r.inc(Word(b)) -func `+=`*(a: var Stuint, b: SomeUnsignedInt) = +func `+=`*(a: var StUint, b: SomeUnsignedInt) = ## In-place addition for multi-precision unsigned int ## with an unsigned integer a.inc(Word(b)) diff --git a/stint/private/uint_bitwise.nim b/stint/private/uint_bitwise.nim index 598d88d..0085a62 100644 --- a/stint/private/uint_bitwise.nim +++ b/stint/private/uint_bitwise.nim @@ -15,42 +15,42 @@ import # Bitwise operations # -------------------------------------------------------- -{.push raises: [], inline, noInit, gcsafe.} +{.push raises: [], inline, noinit, gcsafe.} -func bitnot*(r: var StUint, a: Stuint) = +func bitnot*(r: var StUint, a: StUint) = ## Bitwise complement of unsigned integer a ## i.e. flips all bits of the input for i in 0 ..< r.limbs.len: r[i] = not a[i] r.clearExtraBitsOverMSB() -func bitor*(r: var Stuint, a, b: Stuint) = +func bitor*(r: var StUint, a, b: StUint) = ## `Bitwise or` of numbers a and b for i in 0 ..< r.limbs.len: r[i] = a[i] or b[i] -func bitand*(r: var Stuint, a, b: Stuint) = +func bitand*(r: var StUint, a, b: StUint) = ## `Bitwise and` of numbers a and b for i in 0 ..< r.limbs.len: r[i] = a[i] and b[i] -func bitxor*(r: var Stuint, a, b: Stuint) = +func bitxor*(r: var StUint, a, b: StUint) = ## `Bitwise xor` of numbers x and y for i in 0 ..< r.limbs.len: r[i] = a[i] xor b[i] r.clearExtraBitsOverMSB() -func countOnes*(a: Stuint): int = +func countOnes*(a: StUint): int = result = 0 for i in 0 ..< a.limbs.len: result += countOnes(a[i]) -func parity*(a: Stuint): int = +func parity*(a: StUint): int = result = parity(a.limbs[0]) for i in 1 ..< a.limbs.len: result = result xor parity(a.limbs[i]) -func leadingZeros*(a: Stuint): int = +func leadingZeros*(a: StUint): int = result = 0 # Adjust when we use only part of the word size @@ -66,7 +66,7 @@ func leadingZeros*(a: Stuint): int = if zeroCount != WordBitWidth: break -func trailingZeros*(a: Stuint): 
int = +func trailingZeros*(a: StUint): int = result = 0 for i in 0 ..< a.limbs.len: let zeroCount = a[i].trailingZeros() @@ -78,7 +78,7 @@ func trailingZeros*(a: Stuint): int = if result > a.bits: result = a.bits -func firstOne*(a: Stuint): int = +func firstOne*(a: StUint): int = result = trailingZeros(a) if result == a.limbs.len * WordBitWidth: result = 0 diff --git a/stint/private/uint_shift.nim b/stint/private/uint_shift.nim index a0181c2..093db7a 100644 --- a/stint/private/uint_shift.nim +++ b/stint/private/uint_shift.nim @@ -74,12 +74,12 @@ func shlWords*(r: var Limbs, a: Limbs, w: SomeInteger) = # Wrappers # -------------------------------------------------------- -func shiftRight*(r: var Stuint, a: Stuint, k: SomeInteger) = +func shiftRight*(r: var StUint, a: StUint, k: SomeInteger) = ## Shift `a` right by k bits and store in `r` if k == 0: r = a return - + if k < WordBitWidth: r.limbs.shrSmall(a.limbs, k) return @@ -93,12 +93,12 @@ func shiftRight*(r: var Stuint, a: Stuint, k: SomeInteger) = else: r.limbs.shrLarge(a.limbs, w, shift) -func shiftLeft*(r: var Stuint, a: Stuint, k: SomeInteger) = +func shiftLeft*(r: var StUint, a: StUint, k: SomeInteger) = ## Shift `a` left by k bits and store in `r` if k == 0: r = a return - + if k < WordBitWidth: r.limbs.shlSmall(a.limbs, k) r.clearExtraBitsOverMSB() diff --git a/stint/uintops.nim b/stint/uintops.nim index 3c406ef..bedccce 100644 --- a/stint/uintops.nim +++ b/stint/uintops.nim @@ -21,7 +21,7 @@ export StUint # Initialization # -------------------------------------------------------- -{.push raises: [], inline, noInit, gcsafe.} +{.push raises: [], inline, noinit, gcsafe.} func setZero*(a: var StUint) = ## Set ``a`` to 0 @@ -37,40 +37,40 @@ func setSmallInt(a: var StUint, k: Word) = func setOne*(a: var StUint) = setSmallInt(a, 1) -func zero*[bits: static[int]](T: typedesc[Stuint[bits]]): T {.inline.} = +func zero*[bits: static[int]](T: typedesc[StUint[bits]]): T {.inline.} = ## Returns the zero of the input 
type discard -func one*[bits: static[int]](T: typedesc[Stuint[bits]]): T {.inline.} = +func one*[bits: static[int]](T: typedesc[StUint[bits]]): T {.inline.} = ## Returns the one of the input type result.setOne() -func high*[bits](_: typedesc[Stuint[bits]]): Stuint[bits] {.inline.} = +func high*[bits](_: typedesc[StUint[bits]]): StUint[bits] {.inline.} = for i in 0 ..< result.limbs.len: result[i] = high(Word) -func low*[bits](_: typedesc[Stuint[bits]]): Stuint[bits] {.inline.} = +func low*[bits](_: typedesc[StUint[bits]]): StUint[bits] {.inline.} = discard {.pop.} # Comparisons # -------------------------------------------------------- -{.push raises: [], inline, noInit, gcsafe.} +{.push raises: [], inline, noinit, gcsafe.} -func isZero*(a: Stuint): bool = +func isZero*(a: StUint): bool = for i in 0 ..< a.limbs.len: if a[i] != 0: return false return true -func `==`*(a, b: Stuint): bool {.inline.} = +func `==`*(a, b: StUint): bool {.inline.} = ## Unsigned `equal` comparison for i in 0 ..< a.limbs.len: if a[i] != b[i]: return false return true -func `<`*(a, b: Stuint): bool {.inline.} = +func `<`*(a, b: StUint): bool {.inline.} = ## Unsigned `less than` comparison var diff: Word var borrow: Borrow @@ -78,16 +78,16 @@ func `<`*(a, b: Stuint): bool {.inline.} = subB(borrow, diff, a[i], b[i], borrow) return bool(borrow) -func `<=`*(a, b: Stuint): bool {.inline.} = +func `<=`*(a, b: StUint): bool {.inline.} = ## Unsigned `less or equal` comparison not(b < a) -func isOdd*(a: Stuint): bool {.inline.} = +func isOdd*(a: StUint): bool {.inline.} = ## Returns true if input is off ## false otherwise bool(a[0] and 1) -func isEven*(a: Stuint): bool {.inline.} = +func isEven*(a: StUint): bool {.inline.} = ## Returns true if input is zero ## false otherwise not a.isOdd() @@ -95,22 +95,22 @@ func isEven*(a: Stuint): bool {.inline.} = {.pop.} # Bitwise operations # -------------------------------------------------------- -{.push raises: [], inline, noInit, gcsafe.} +{.push raises: [], 
inline, noinit, gcsafe.} -func `not`*(a: Stuint): Stuint = +func `not`*(a: StUint): StUint = ## Bitwise complement of unsigned integer a ## i.e. flips all bits of the input result.bitnot(a) -func `or`*(a, b: Stuint): Stuint = +func `or`*(a, b: StUint): StUint = ## `Bitwise or` of numbers a and b result.bitor(a, b) -func `and`*(a, b: Stuint): Stuint = +func `and`*(a, b: StUint): StUint = ## `Bitwise and` of numbers a and b result.bitand(a, b) -func `xor`*(a, b: Stuint): Stuint = +func `xor`*(a, b: StUint): StUint = ## `Bitwise xor` of numbers x and y result.bitxor(a, b) @@ -125,11 +125,11 @@ export {.push raises: [], inline, gcsafe.} -func `shr`*(a: Stuint, k: SomeInteger): Stuint = +func `shr`*(a: StUint, k: SomeInteger): StUint = ## Shift right by k bits result.shiftRight(a, k) -func `shl`*(a: Stuint, k: SomeInteger): Stuint = +func `shl`*(a: StUint, k: SomeInteger): StUint = ## Shift left by k bits result.shiftLeft(a, k) @@ -137,15 +137,15 @@ func `shl`*(a: Stuint, k: SomeInteger): Stuint = # Addsub # -------------------------------------------------------- -{.push raises: [], inline, noInit, gcsafe.} +{.push raises: [], inline, noinit, gcsafe.} -func `+`*(a, b: Stuint): Stuint = +func `+`*(a, b: StUint): StUint = ## Addition for multi-precision unsigned int result.sum(a, b) export `+=` -func `-`*(a, b: Stuint): Stuint = +func `-`*(a, b: StUint): StUint = ## Substraction for multi-precision unsigned int result.diff(a, b) @@ -153,7 +153,7 @@ export `-=` export inc -func `+`*(a: Stuint, b: SomeUnsignedInt): Stuint = +func `+`*(a: StUint, b: SomeUnsignedInt): StUint = ## Addition for multi-precision unsigned int ## with an unsigned integer result.sum(a, Word(b)) @@ -169,9 +169,9 @@ export `+=` # - It's implemented at the limb-level so that # in the future Stuint[254] and Stuint256] share a common codepath -{.push raises: [], inline, noInit, gcsafe.} +{.push raises: [], inline, noinit, gcsafe.} -func `*`*(a, b: Stuint): Stuint = +func `*`*(a, b: StUint): StUint = ## 
Integer multiplication result.limbs.prod(a.limbs, b.limbs) result.clearExtraBitsOverMSB() @@ -181,9 +181,9 @@ func `*`*(a, b: Stuint): Stuint = # Exponentiation # -------------------------------------------------------- -{.push raises: [], noInit, gcsafe.} +{.push raises: [], noinit, gcsafe.} -func pow*(a: Stuint, e: Natural): Stuint = +func pow*(a: StUint, e: Natural): StUint = ## Compute ``a`` to the power of ``e``, ## ``e`` must be non-negative @@ -202,7 +202,7 @@ func pow*(a: Stuint, e: Natural): Stuint = break a = a * a -func pow*[aBits, eBits](a: Stuint[aBits], e: Stuint[eBits]): Stuint[aBits] = +func pow*[aBits, eBits](a: StUint[aBits], e: StUint[eBits]): StUint[aBits] = ## Compute ``x`` to the power of ``y``, ## ``x`` must be non-negative # Implementation uses exponentiation by squaring @@ -224,19 +224,19 @@ func pow*[aBits, eBits](a: Stuint[aBits], e: Stuint[eBits]): Stuint[aBits] = # Division & Modulo # -------------------------------------------------------- -{.push raises: [], inline, noInit, gcsafe.} +{.push raises: [], inline, noinit, gcsafe.} -func `div`*(x, y: Stuint): Stuint = +func `div`*(x, y: StUint): StUint = ## Division operation for multi-precision unsigned uint - var tmp{.noInit.}: Stuint + var tmp{.noinit.}: StUint divRem(result.limbs, tmp.limbs, x.limbs, y.limbs) -func `mod`*(x, y: Stuint): Stuint = +func `mod`*(x, y: StUint): StUint = ## Remainder operation for multi-precision unsigned uint - var tmp{.noInit.}: Stuint + var tmp{.noinit.}: StUint divRem(tmp.limbs, result.limbs, x.limbs, y.limbs) -func divmod*(x, y: Stuint): tuple[quot, rem: Stuint] = +func divmod*(x, y: StUint): tuple[quot, rem: StUint] = ## Division and remainder operations for multi-precision unsigned uint divRem(result.quot.limbs, result.rem.limbs, x.limbs, y.limbs) diff --git a/tests/all_tests.nim b/tests/all_tests.nim index a1fba07..b6ba60c 100644 --- a/tests/all_tests.nim +++ b/tests/all_tests.nim @@ -7,17 +7,19 @@ # # at your option. 
This file may not be copied, modified, or distributed except according to those terms. -import test_uint_bitops2, - test_uint_endianness, - test_uint_comparison, - test_uint_bitwise, - test_uint_addsub, - test_uint_muldiv, - test_uint_exp, - test_uint_modular_arithmetic, - test_uint_endians2, - test_randomized_divmod +import + test_uint_addsub, + test_uint_bitops2, + test_uint_bitwise, + test_uint_comparison, + #test_uint_divmod, + test_uint_endianness, + test_uint_endians2, + test_uint_exp, + #test_uint_modular_arithmetic, + test_uint_mul +#[ import test_int_endianness, test_int_comparison, test_int_addsub, @@ -27,4 +29,4 @@ import test_int_endianness, import test_io, test_conversion - +]# diff --git a/tests/test_uint_addsub.nim b/tests/test_uint_addsub.nim index b78eafc..c5c09bf 100644 --- a/tests/test_uint_addsub.nim +++ b/tests/test_uint_addsub.nim @@ -35,7 +35,7 @@ template chkInplaceSubstraction(chk, a, b, c, bits: untyped) = template testAddSub(chk, tst: untyped) = tst "addition": - chkAddition(chk, 0'u8, 0'u8, 0'u8, 8) + #[chkAddition(chk, 0'u8, 0'u8, 0'u8, 8) chkAddition(chk, high(uint8) - 17'u8, 17'u8, high(uint8), 8) chkAddition(chk, low(uint8), 17'u8, low(uint8) + 17'u8, 8) @@ -61,7 +61,7 @@ template testAddSub(chk, tst: untyped) = chkAddition(chk, high(uint32) - 17'u32, 17'u32, high(uint32), 64) chkAddition(chk, low(uint32), 17'u32, low(uint32) + 17'u32, 64) chkAddition(chk, high(uint64) - 17'u64, 17'u64, high(uint64), 64) - chkAddition(chk, low(uint64), 17'u64, low(uint64) + 17'u64, 64) + chkAddition(chk, low(uint64), 17'u64, low(uint64) + 17'u64, 64)]# chkAddition(chk, 0'u8, 0'u8, 0'u8, 128) chkAddition(chk, high(uint8) - 17'u8, 17'u8, high(uint8), 128) @@ -74,7 +74,7 @@ template testAddSub(chk, tst: untyped) = chkAddition(chk, low(uint64), 17'u64, low(uint64) + 17'u64, 128) tst "inplace addition": - chkInplaceAddition(chk, 0'u8, 0'u8, 0'u8, 8) + #[chkInplaceAddition(chk, 0'u8, 0'u8, 0'u8, 8) chkInplaceAddition(chk, high(uint8) - 17'u8, 17'u8, 
high(uint8), 8) chkInplaceAddition(chk, low(uint8) + 17'u8, 17'u8, low(uint8) + 34'u8, 8) @@ -100,7 +100,7 @@ template testAddSub(chk, tst: untyped) = chkInplaceAddition(chk, high(uint32) - 17'u32, 17'u32, high(uint32), 64) chkInplaceAddition(chk, low(uint32) + 17'u32, 17'u32, low(uint32) + 34'u32, 64) chkInplaceAddition(chk, high(uint64) - 17'u64, 17'u64, high(uint64), 64) - chkInplaceAddition(chk, low(uint64) + 17'u64, 17'u64, low(uint64) + 34'u64, 64) + chkInplaceAddition(chk, low(uint64) + 17'u64, 17'u64, low(uint64) + 34'u64, 64)]# chkInplaceAddition(chk, 0'u8, 0'u8, 0'u8, 128) chkInplaceAddition(chk, high(uint8) - 17'u8, 17'u8, high(uint8), 128) @@ -113,7 +113,7 @@ template testAddSub(chk, tst: untyped) = chkInplaceAddition(chk, low(uint64) + 17'u64, 17'u64, low(uint64) + 34'u64, 128) tst "substraction": - chkSubstraction(chk, 0'u8, 0'u8, 0'u8, 8) + #[chkSubstraction(chk, 0'u8, 0'u8, 0'u8, 8) chkSubstraction(chk, high(uint8) - 17'u8, 17'u8, high(uint8) - 34'u8, 8) chkSubstraction(chk, low(uint8) + 17'u8, 17'u8, low(uint8), 8) @@ -139,7 +139,7 @@ template testAddSub(chk, tst: untyped) = chkSubstraction(chk, high(uint32) - 17'u32, 17'u32, high(uint32) - 34'u32, 64) chkSubstraction(chk, low(uint32) + 17'u32, 17'u32, low(uint32), 64) chkSubstraction(chk, high(uint64) - 17'u64, 17'u64, high(uint64) - 34'u64, 64) - chkSubstraction(chk, low(uint64) + 17'u64, 17'u64, low(uint64), 64) + chkSubstraction(chk, low(uint64) + 17'u64, 17'u64, low(uint64), 64)]# chkSubstraction(chk, 0'u8, 0'u8, 0'u8, 128) chkSubstraction(chk, high(uint8) - 17'u8, 17'u8, high(uint8) - 34'u8, 128) @@ -152,7 +152,7 @@ template testAddSub(chk, tst: untyped) = chkSubstraction(chk, high(uint64), high(uint64), 0'u64, 128) tst "inplace substraction": - chkInplaceSubstraction(chk, 0'u8, 0'u8, 0'u8, 8) + #[chkInplaceSubstraction(chk, 0'u8, 0'u8, 0'u8, 8) chkInplaceSubstraction(chk, high(uint8) - 17'u8, 17'u8, high(uint8) - 34'u8, 8) chkInplaceSubstraction(chk, low(uint8) + 17'u8, 17'u8, low(uint8), 8) 
@@ -178,7 +178,7 @@ template testAddSub(chk, tst: untyped) = chkInplaceSubstraction(chk, high(uint32) - 17'u32, 17'u32, high(uint32) - 34'u32, 64) chkInplaceSubstraction(chk, low(uint32) + 17'u32, 17'u32, low(uint32), 64) chkInplaceSubstraction(chk, high(uint64) - 17'u64, 17'u64, high(uint64) - 34'u64, 64) - chkInplaceSubstraction(chk, low(uint64) + 17'u64, 17'u64, low(uint64), 64) + chkInplaceSubstraction(chk, low(uint64) + 17'u64, 17'u64, low(uint64), 64)]# chkInplaceSubstraction(chk, 0'u8, 0'u8, 0'u8, 128) chkInplaceSubstraction(chk, high(uint8) - 17'u8, 17'u8, high(uint8) - 34'u8, 128) @@ -196,6 +196,7 @@ static: suite "Wider unsigned int addsub coverage": testAddSub(check, test) +#[ suite "Testing unsigned int addition implementation": test "In-place addition gives expected result": @@ -261,3 +262,4 @@ suite "Testing unsigned int substraction implementation": let b = 101'u16.stuint(16) check: cast[uint16](a-b) == high(uint16) +]# diff --git a/tests/test_uint_bitwise.nim b/tests/test_uint_bitwise.nim index 369c807..8e72fec 100644 --- a/tests/test_uint_bitwise.nim +++ b/tests/test_uint_bitwise.nim @@ -47,7 +47,7 @@ template testBitwise(chk, tst: untyped) = #chkShr(chk, "F0000000000000000000000000000000", 128, "00", 128) tst "operator `not`": - chkNot(chk, 0'u8, not 0'u8, 8) + #[chkNot(chk, 0'u8, not 0'u8, 8) chkNot(chk, high(uint8), not high(uint8), 8) chkNot(chk, "F0", "0F", 8) chkNot(chk, "0F", "F0", 8) @@ -83,7 +83,7 @@ template testBitwise(chk, tst: untyped) = chkNot(chk, high(uint8), not uint64(high(uint8)), 64) chkNot(chk, high(uint16), not uint64(high(uint16)), 64) chkNot(chk, high(uint32), not uint64(high(uint32)), 64) - chkNot(chk, high(uint64), not high(uint64), 64) + chkNot(chk, high(uint64), not high(uint64), 64)]# chkNot(chk, "0", "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", 128) chkNot(chk, "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", "0", 128) @@ -91,7 +91,7 @@ template testBitwise(chk, tst: untyped) = chkNot(chk, "FFFFFFFFFFFF00000000000000000000", 
"000000000000FFFFFFFFFFFFFFFFFFFF", 128) tst "operator `or`": - chkOr(chk, "00", "FF", "FF", 8) + #[chkOr(chk, "00", "FF", "FF", 8) chkOr(chk, "FF", "00", "FF", 8) chkOr(chk, "F0", "0F", "FF", 8) chkOr(chk, "00", "00", "00", 8) @@ -114,7 +114,7 @@ template testBitwise(chk, tst: untyped) = chkOr(chk, "F0", "0F", "00000000000000FF", 64) chkOr(chk, "00", "00", "0000000000000000", 64) chkOr(chk, "FF00", "0F00", "000000000000FF00", 64) - chkOr(chk, "00FF00FF", "000F000F", "0000000000FF00FF", 64) + chkOr(chk, "00FF00FF", "000F000F", "0000000000FF00FF", 64)]# chkOr(chk, "00", "FF", "000000000000000000000000000000FF", 128) chkOr(chk, "FF", "00", "000000000000000000000000000000FF", 128) @@ -125,7 +125,7 @@ template testBitwise(chk, tst: untyped) = chkOr(chk, "00000000000000000000000000FF00FF", "FF0F0000000000000000000000FF00FF", "FF0F0000000000000000000000FF00FF", 128) tst "operator `and`": - chkAnd(chk, "00", "FF", "00", 8) + #[chkAnd(chk, "00", "FF", "00", 8) chkAnd(chk, "FF", "00", "00", 8) chkAnd(chk, "F0", "0F", "00", 8) chkAnd(chk, "00", "00", "00", 8) @@ -150,7 +150,7 @@ template testBitwise(chk, tst: untyped) = chkAnd(chk, "F0", "0F", "0000000000000000", 64) chkAnd(chk, "00", "00", "0000000000000000", 64) chkAnd(chk, "FF00", "0F00", "0000000000000F00", 64) - chkAnd(chk, "00FF00FF", "000F000F", "00000000000F000F", 64) + chkAnd(chk, "00FF00FF", "000F000F", "00000000000F000F", 64)]# chkAnd(chk, "00", "FF", "00000000000000000000000000000000", 128) chkAnd(chk, "FF", "00", "00000000000000000000000000000000", 128) @@ -161,7 +161,7 @@ template testBitwise(chk, tst: untyped) = chkAnd(chk, "F0000000000000000000000000FF00FF", "FF0F0000000000000000000000FF00FF", "F0000000000000000000000000FF00FF", 128) tst "operator `xor`": - chkXor(chk, "00", "FF", "FF", 8) + #[chkXor(chk, "00", "FF", "FF", 8) chkXor(chk, "FF", "00", "FF", 8) chkXor(chk, "F0", "0F", "FF", 8) chkXor(chk, "00", "00", "00", 8) @@ -186,7 +186,7 @@ template testBitwise(chk, tst: untyped) = chkXor(chk, "F0", "0F", 
"00000000000000FF", 64) chkXor(chk, "00", "00", "0000000000000000", 64) chkXor(chk, "FF00", "0F00", "000000000000F000", 64) - chkXor(chk, "00FF00FF", "000F000F", "0000000000F000F0", 64) + chkXor(chk, "00FF00FF", "000F000F", "0000000000F000F0", 64)]# chkXor(chk, "00", "FF", "000000000000000000000000000000FF", 128) chkXor(chk, "FF", "00", "000000000000000000000000000000FF", 128) @@ -197,7 +197,7 @@ template testBitwise(chk, tst: untyped) = chkXor(chk, "F0000000000000000000000000FF00FF", "FF0F0000000000000000000000FF00FF", "0F0F0000000000000000000000000000", 128) tst "operator `shl`": - chkShl(chk, "0F", 4, "F0", 8) + #[chkShl(chk, "0F", 4, "F0", 8) chkShl(chk, "F0", 4, "00", 8) chkShl(chk, "F0", 3, "80", 8) chkShl(chk, "0F", 7, "80", 8) @@ -226,7 +226,7 @@ template testBitwise(chk, tst: untyped) = chkShl(chk, "0F", 5, "1E0", 64) chkShl(chk, "0F", 9, "1E00", 64) chkShl(chk, "0F", 17, "1E0000", 64) - chkShl(chk, "0F", 33, "1E00000000", 64) + chkShl(chk, "0F", 33, "1E00000000", 64)]# chkShl(chk, "0F", 4, "F0", 128) chkShl(chk, "F0", 4, "F00", 128) @@ -257,7 +257,7 @@ template testBitwise(chk, tst: untyped) = chkShl(chk, "0F", 255, "8000000000000000000000000000000000000000000000000000000000000000", 256) tst "operator `shr`": - chkShr(chk, "0F", 4, "00", 8) + #[chkShr(chk, "0F", 4, "00", 8) chkShr(chk, "F0", 4, "0F", 8) chkShr(chk, "F0", 3, "1E", 8) chkShr(chk, "F0", 7, "01", 8) @@ -278,7 +278,7 @@ template testBitwise(chk, tst: untyped) = chkShr(chk, "F0", 3, "1E", 64) chkShr(chk, "F000", 3, "1E00", 64) chkShr(chk, "F0000000", 3, "1E000000", 64) - chkShr(chk, "F000000000000000", 63, "0000000000000001", 64) + chkShr(chk, "F000000000000000", 63, "0000000000000001", 64)]# chkShr(chk, "0F", 4, "00", 128) chkShr(chk, "F0", 4, "0F", 128) @@ -311,6 +311,7 @@ static: suite "Wider unsigned int bitwise coverage": testBitwise(check, test) +#[ suite "Testing unsigned int bitwise operations": let a = 100'i16.stuint(16) @@ -348,3 +349,4 @@ suite "Testing unsigned int bitwise 
operations": test "Shift right - by half the size of the integer": check: cast[uint16](b) == z # Sanity check check: cast[uint16](b shr 8) == z shr 8 +]# \ No newline at end of file diff --git a/tests/test_uint_comparison.nim b/tests/test_uint_comparison.nim index ad002fe..39d850e 100644 --- a/tests/test_uint_comparison.nim +++ b/tests/test_uint_comparison.nim @@ -47,7 +47,7 @@ template chkNotIsEven(chk: untyped, a: string, bits: int) = template testComparison(chk, tst: untyped) = tst "operator `LT`": - chkLT(chk, "0", "F", 8) + #[chkLT(chk, "0", "F", 8) chkLT(chk, "F", "FF", 8) chkLT(chk, "0", "F", 16) @@ -63,7 +63,7 @@ template testComparison(chk, tst: untyped) = chkLT(chk, "F", "FF", 64) chkLT(chk, "FF", "FFF", 64) chkLT(chk, "FFFF", "FFFFF", 64) - chkLT(chk, "FFFFF", "FFFFFFFF", 64) + chkLT(chk, "FFFFF", "FFFFFFFF", 64)]# chkLT(chk, "0", "F", 128) chkLT(chk, "F", "FF", 128) @@ -73,7 +73,7 @@ template testComparison(chk, tst: untyped) = chkLT(chk, "FFFFFFFFFFF", "FFFFFFFFFFFFFFFFFFFFFFFF", 128) tst "operator not `LT`": - chkNotLT(chk, "0", "F", 8) + #[chkNotLT(chk, "0", "F", 8) chkNotLT(chk, "F", "FF", 8) chkNotLT(chk, "0", "F", 16) @@ -89,7 +89,7 @@ template testComparison(chk, tst: untyped) = chkNotLT(chk, "F", "FF", 64) chkNotLT(chk, "FF", "FFF", 64) chkNotLT(chk, "FFFF", "FFFFF", 64) - chkNotLT(chk, "FFFFF", "FFFFFFFF", 64) + chkNotLT(chk, "FFFFF", "FFFFFFFF", 64)]# chkNotLT(chk, "0", "F", 128) chkNotLT(chk, "F", "FF", 128) @@ -99,7 +99,7 @@ template testComparison(chk, tst: untyped) = chkNotLT(chk, "FFFFFFFFFFF", "FFFFFFFFFFFFFFFFFFFFFFFF", 128) tst "operator `LTE`": - chkLTE(chk, "0", "F", 8) + #[chkLTE(chk, "0", "F", 8) chkLTE(chk, "F", "FF", 8) chkLTE(chk, "F", "F", 8) @@ -119,7 +119,7 @@ template testComparison(chk, tst: untyped) = chkLTE(chk, "FF", "FFF", 64) chkLTE(chk, "FFFF", "FFFFF", 64) chkLTE(chk, "FFFFF", "FFFFFFFF", 64) - chkLTE(chk, "FFFFFFFF", "FFFFFFFF", 64) + chkLTE(chk, "FFFFFFFF", "FFFFFFFF", 64)]# chkLTE(chk, "0", "F", 128) chkLTE(chk, 
"F", "FF", 128) @@ -130,7 +130,7 @@ template testComparison(chk, tst: untyped) = chkLTE(chk, "FFFFFFFFFFFFFFFFFFFFFFFF", "FFFFFFFFFFFFFFFFFFFFFFFF", 128) tst "operator not `LTE`": - chkNotLTE(chk, "0", "F", 8) + #[chkNotLTE(chk, "0", "F", 8) chkNotLTE(chk, "F", "FF", 8) chkNotLTE(chk, "0", "F", 16) @@ -146,7 +146,7 @@ template testComparison(chk, tst: untyped) = chkNotLTE(chk, "F", "FF", 64) chkNotLTE(chk, "FF", "FFF", 64) chkNotLTE(chk, "FFFF", "FFFFF", 64) - chkNotLTE(chk, "FFFFF", "FFFFFFFF", 64) + chkNotLTE(chk, "FFFFF", "FFFFFFFF", 64)]# chkNotLTE(chk, "0", "F", 128) chkNotLTE(chk, "F", "FF", 128) @@ -156,7 +156,7 @@ template testComparison(chk, tst: untyped) = chkNotLTE(chk, "FFFFFFFFFFF", "FFFFFFFFFFFFFFFFFFFFFFFF", 128) tst "operator `EQ`": - chkEQ(chk, "0", "0", 8) + #[chkEQ(chk, "0", "0", 8) chkEQ(chk, "FF", "FF", 8) chkEQ(chk, "F", "F", 8) @@ -176,7 +176,7 @@ template testComparison(chk, tst: untyped) = chkEQ(chk, "FF", "FF", 64) chkEQ(chk, "FFFF", "FFFF", 64) chkEQ(chk, "FFFFF", "FFFFF", 64) - chkEQ(chk, "FFFFFFFF", "FFFFFFFF", 64) + chkEQ(chk, "FFFFFFFF", "FFFFFFFF", 64)]# chkEQ(chk, "0", "0", 128) chkEQ(chk, "F", "F", 128) @@ -186,7 +186,7 @@ template testComparison(chk, tst: untyped) = chkEQ(chk, "FFFFFFFFFFFFFFFFFFFFFFFF", "FFFFFFFFFFFFFFFFFFFFFFFF", 128) tst "operator not `EQ`": - chkNotEQ(chk, "0", "F", 8) + #[chkNotEQ(chk, "0", "F", 8) chkNotEQ(chk, "F", "FF", 8) chkNotEQ(chk, "0", "F", 16) @@ -202,7 +202,7 @@ template testComparison(chk, tst: untyped) = chkNotEQ(chk, "F", "FF", 64) chkNotEQ(chk, "FF", "FFF", 64) chkNotEQ(chk, "FFFF", "FFFFF", 64) - chkNotEQ(chk, "FFFFF", "FFFFFFFF", 64) + chkNotEQ(chk, "FFFFF", "FFFFFFFF", 64)]# chkNotEQ(chk, "0", "F", 128) chkNotEQ(chk, "F", "FF", 128) @@ -212,92 +212,92 @@ template testComparison(chk, tst: untyped) = chkNotEQ(chk, "FFFFFFFFFFF", "FFFFFFFFFFFFFFFFFFFFFFFF", 128) tst "operator `isZero`": - chkIsZero(chk, "0", 8) + #[chkIsZero(chk, "0", 8) chkIsZero(chk, "0", 16) chkIsZero(chk, "0", 32) - 
chkIsZero(chk, "0", 64) + chkIsZero(chk, "0", 64)]# chkIsZero(chk, "0", 128) chkIsZero(chk, "0", 256) tst "operator not `isZero`": - chkNotIsZero(chk, "1", 8) + #[chkNotIsZero(chk, "1", 8) chkNotIsZero(chk, "2", 16) chkNotIsZero(chk, "3", 32) - chkNotIsZero(chk, "4", 64) + chkNotIsZero(chk, "4", 64)]# chkNotIsZero(chk, "5", 128) chkNotIsZero(chk, "6", 256) tst "operator `isOdd`": - chkIsOdd(chk, "1", 8) + #[chkIsOdd(chk, "1", 8) chkIsOdd(chk, "1", 16) chkIsOdd(chk, "1", 32) - chkIsOdd(chk, "1", 64) + chkIsOdd(chk, "1", 64)]# chkIsOdd(chk, "1", 128) chkIsOdd(chk, "1", 256) - chkIsOdd(chk, "FF", 8) + #[chkIsOdd(chk, "FF", 8) chkIsOdd(chk, "FFF", 16) chkIsOdd(chk, "FFFFF", 32) - chkIsOdd(chk, "FFFFFF", 64) + chkIsOdd(chk, "FFFFFF", 64)]# chkIsOdd(chk, "FFFFFFFFFFFFFFF", 128) chkIsOdd(chk, "FFFFFFFFFFFFFFFFFF", 256) tst "operator not `isOdd`": - chkNotIsOdd(chk, "0", 8) + #[chkNotIsOdd(chk, "0", 8) chkNotIsOdd(chk, "0", 16) chkNotIsOdd(chk, "0", 32) - chkNotIsOdd(chk, "0", 64) + chkNotIsOdd(chk, "0", 64)]# chkNotIsOdd(chk, "0", 128) chkNotIsOdd(chk, "0", 256) - chkNotIsOdd(chk, "4", 8) + #[chkNotIsOdd(chk, "4", 8) chkNotIsOdd(chk, "4", 16) chkNotIsOdd(chk, "4", 32) - chkNotIsOdd(chk, "4", 64) + chkNotIsOdd(chk, "4", 64)]# chkNotIsOdd(chk, "4", 128) chkNotIsOdd(chk, "4", 256) - chkNotIsOdd(chk, "A", 8) + #[chkNotIsOdd(chk, "A", 8) chkNotIsOdd(chk, "AAA", 16) chkNotIsOdd(chk, "AAAA", 32) - chkNotIsOdd(chk, "FFFFFA", 64) + chkNotIsOdd(chk, "FFFFFA", 64)]# chkNotIsOdd(chk, "FFFFFFFFFFFFFFA", 128) chkNotIsOdd(chk, "FFFFFFFFFFFFFFFFFA", 256) tst "operator `isEven`": - chkNotIsOdd(chk, "0", 8) + #[chkNotIsOdd(chk, "0", 8) chkNotIsOdd(chk, "0", 16) chkNotIsOdd(chk, "0", 32) - chkNotIsOdd(chk, "0", 64) + chkNotIsOdd(chk, "0", 64)]# chkNotIsOdd(chk, "0", 128) chkNotIsOdd(chk, "0", 256) - chkNotIsOdd(chk, "4", 8) + #[chkNotIsOdd(chk, "4", 8) chkNotIsOdd(chk, "4", 16) chkNotIsOdd(chk, "4", 32) - chkNotIsOdd(chk, "4", 64) + chkNotIsOdd(chk, "4", 64)]# chkNotIsOdd(chk, "4", 128) 
chkNotIsOdd(chk, "4", 256) - chkNotIsOdd(chk, "A", 8) + #[chkNotIsOdd(chk, "A", 8) chkNotIsOdd(chk, "AAA", 16) chkNotIsOdd(chk, "AAAA", 32) - chkNotIsOdd(chk, "FFFFFA", 64) + chkNotIsOdd(chk, "FFFFFA", 64)]# chkNotIsOdd(chk, "FFFFFFFFFFFFFFA", 128) chkNotIsOdd(chk, "FFFFFFFFFFFFFFFFFA", 256) tst "operator not `isEven`": - chkIsOdd(chk, "1", 8) + #[chkIsOdd(chk, "1", 8) chkIsOdd(chk, "1", 16) chkIsOdd(chk, "1", 32) - chkIsOdd(chk, "1", 64) + chkIsOdd(chk, "1", 64)]# chkIsOdd(chk, "1", 128) chkIsOdd(chk, "1", 256) - chkIsOdd(chk, "FF", 8) + #[chkIsOdd(chk, "FF", 8) chkIsOdd(chk, "FFF", 16) chkIsOdd(chk, "FFFFF", 32) - chkIsOdd(chk, "FFFFFF", 64) + chkIsOdd(chk, "FFFFFF", 64)]# chkIsOdd(chk, "FFFFFFFFFFFFFFF", 128) chkIsOdd(chk, "FFFFFFFFFFFFFFFFFF", 256) @@ -307,6 +307,7 @@ static: suite "Wider unsigned int comparison coverage": testComparison(check, test) +#[ suite "Testing unsigned int comparison operators": let a = 10'i16.stuint(16) @@ -359,4 +360,4 @@ suite "Testing unsigned int comparison operators": b.isOdd not b.isEven # c.isEven - # not c.isOdd + # not c.isOdd]# diff --git a/tests/test_uint_divmod.nim b/tests/test_uint_divmod.nim index 0cb5002..4dae110 100644 --- a/tests/test_uint_divmod.nim +++ b/tests/test_uint_divmod.nim @@ -20,7 +20,7 @@ template chkDivMod(chk: untyped, a, b, c, d: string, bits: int) = template testdivmod(chk, tst: untyped) = tst "operator `div`": - chkDiv(chk, "0", "3", "0", 8) + #[chkDiv(chk, "0", "3", "0", 8) chkDiv(chk, "1", "3", "0", 8) chkDiv(chk, "3", "3", "1", 8) chkDiv(chk, "3", "1", "3", 8) @@ -48,7 +48,7 @@ template testdivmod(chk, tst: untyped) = chkDiv(chk, "FF", "3", "55", 64) chkDiv(chk, "FFFF", "3", "5555", 64) chkDiv(chk, "FFFFFFFF", "3", "55555555", 64) - chkDiv(chk, "FFFFFFFFFFFFFFFF", "3", "5555555555555555", 64) + chkDiv(chk, "FFFFFFFFFFFFFFFF", "3", "5555555555555555", 64)]# chkDiv(chk, "0", "3", "0", 128) chkDiv(chk, "1", "3", "0", 128) @@ -61,7 +61,7 @@ template testdivmod(chk, tst: untyped) = chkDiv(chk, 
"FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", "3", "55555555555555555555555555555555", 128) tst "operator `mod`": - chkMod(chk, "0", "3", "0", 8) + #[chkMod(chk, "0", "3", "0", 8) chkMod(chk, "1", "3", "1", 8) chkMod(chk, "3", "3", "0", 8) chkMod(chk, "3", "1", "0", 8) @@ -101,7 +101,7 @@ template testdivmod(chk, tst: untyped) = chkMod(chk, "FFFFFFFF", "3", "0", 64) chkMod(chk, "FFFFFFFF", "23", "A", 64) chkMod(chk, "FFFFFFFF", "27", "15", 64) - chkMod(chk, "FFFFFFFFFFFFFFFF", "27", "F", 64) + chkMod(chk, "FFFFFFFFFFFFFFFF", "27", "F", 64)]# chkMod(chk, "0", "3", "0", 128) chkMod(chk, "1", "3", "1", 128) @@ -118,7 +118,7 @@ template testdivmod(chk, tst: untyped) = chkMod(chk, "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", "27", "15", 128) tst "operator `divmod`": - chkDivMod(chk, "0", "3", "0", "0", 8) + #[chkDivMod(chk, "0", "3", "0", "0", 8) chkDivMod(chk, "1", "3", "0", "1", 8) chkDivMod(chk, "3", "3", "1", "0", 8) chkDivMod(chk, "3", "1", "3", "0", 8) @@ -158,7 +158,7 @@ template testdivmod(chk, tst: untyped) = chkDivMod(chk, "FFFFFFFF", "3", "55555555", "0", 64) chkDivMod(chk, "FFFFFFFF", "23", "7507507", "0A", 64) chkDivMod(chk, "FFFFFFFF", "27", "6906906", "15", 64) - chkDivMod(chk, "FFFFFFFFFFFFFFFF", "27", "690690690690690", "F", 64) + chkDivMod(chk, "FFFFFFFFFFFFFFFF", "27", "690690690690690", "F", 64)]# chkDivMod(chk, "0", "3", "0", "0", 128) chkDivMod(chk, "1", "3", "0", "1", 128) @@ -180,6 +180,7 @@ static: suite "Wider unsigned int muldiv coverage": testdivmod(check, test) +#[ suite "Testing unsigned int division and modulo implementation": test "Divmod(100, 13) returns the correct result": @@ -243,3 +244,4 @@ suite "Testing specific failures highlighted by property-based testing": let tz = cast[uint64](a mod b) check: z == tz +]# \ No newline at end of file diff --git a/tests/test_uint_endians2.nim b/tests/test_uint_endians2.nim index 580b37f..d9acc1f 100644 --- a/tests/test_uint_endians2.nim +++ b/tests/test_uint_endians2.nim @@ -36,7 +36,7 @@ template chkFromBytes(chk: 
untyped, bits: int, hex: string) = template chkFromBytesBE(chk: untyped, bits: int, hex: string) = let x = fromHex(StUint[bits], hex) - let z = fromBytesBE(StUint[bits], toBytesBE(x)) + let z = fromBytesBE(StUint[bits], toByteArrayBE(x)) chk z == x template chkFromBytesLE(chk: untyped, bits: int, hex: string) = @@ -51,28 +51,28 @@ template chkFromToLE(chk: untyped, bits: int, hex: string) = template chkFromToBE(chk: untyped, bits: int, hex: string) = let x = fromHex(StUint[bits], hex) - let z = x.fromBE.toBE + let z = x.fromBytesBE.toByteArrayBE chk z == x template chkEndians(chkFunc, tst, name: untyped) = tst astToStr(name).substr(3): - name(chkFunc, 8, "ab") - name(chkFunc, 16, "abcd") - name(chkFunc, 32, "abcdef12") - name(chkFunc, 64, "abcdef1234567890") + #name(chkFunc, 8, "ab") + #name(chkFunc, 16, "abcd") + #name(chkFunc, 32, "abcdef12") + #name(chkFunc, 64, "abcdef1234567890") name(chkFunc, 128, "abcdef1234567890abcdef1234567890") name(chkFunc, 256, "abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890") template testEndians(chkFunc, tst: untyped) = - chkEndians(chkFunc, tst, chkSwapBytes) - chkEndians(chkFunc, tst, chkToBytes) - chkEndians(chkFunc, tst, chkToBytesLE) + #chkEndians(chkFunc, tst, chkSwapBytes) + #chkEndians(chkFunc, tst, chkToBytes) + #chkEndians(chkFunc, tst, chkToBytesLE) chkEndians(chkFunc, tst, chkToBytesBE) - chkEndians(chkFunc, tst, chkFromBytes) - chkEndians(chkFunc, tst, chkFromBytesLE) - chkEndians(chkFunc, tst, chkFromBytesBE) - chkEndians(chkFunc, tst, chkFromToLE) - chkEndians(chkFunc, tst, chkFromToBE) + #chkEndians(chkFunc, tst, chkFromBytes) + #chkEndians(chkFunc, tst, chkFromBytesLE) + #chkEndians(chkFunc, tst, chkFromBytesBE) + #chkEndians(chkFunc, tst, chkFromToLE) + #chkEndians(chkFunc, tst, chkFromToBE) static: testEndians(ctCheck, ctTest) @@ -81,16 +81,16 @@ suite "Testing endians": test "Endians give sane results": check: - 1.u128.toBytesBE() == + 1.u128.toByteArrayBE() == [0'u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 1] - 1.u128.toBytesLE() == - [1'u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + #1.u128.toBytesLE() == + # [1'u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 1.u128 == UInt128.fromBytesBE( [0'u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]) - 1.u128 == UInt128.fromBytesLE( - [1'u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + #1.u128 == UInt128.fromBytesLE( + # [1'u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) testEndians(check, test) diff --git a/tests/test_uint_exp.nim b/tests/test_uint_exp.nim index 93b61cd..44fdd49 100644 --- a/tests/test_uint_exp.nim +++ b/tests/test_uint_exp.nim @@ -17,7 +17,7 @@ template chkPow(chk: untyped, a: string, b: SomeInteger, c: string, bits: int) = template testExp(chk, tst: untyped) = tst "BigInt BigInt Pow": - chkPow(chk, "F", "2", "E1", 8) + #[chkPow(chk, "F", "2", "E1", 8) chkPow(chk, "F", "2", "E1", 16) chkPow(chk, "FF", "2", "FE01", 16) @@ -29,7 +29,7 @@ template testExp(chk, tst: untyped) = chkPow(chk, "F", "2", "E1", 64) chkPow(chk, "FF", "2", "FE01", 64) chkPow(chk, "FF", "3", "FD02FF", 64) - chkPow(chk, "FFF", "3", "FFD002FFF", 64) + chkPow(chk, "FFF", "3", "FFD002FFF", 64)]# chkPow(chk, "F", "2", "E1", 128) chkPow(chk, "FF", "2", "FE01", 128) @@ -38,7 +38,7 @@ template testExp(chk, tst: untyped) = chkPow(chk, "FFFFF", "3", "ffffd00002fffff", 128) tst "BigInt Natural Pow": - chkPow(chk, "F", 2, "E1", 8) + #[chkPow(chk, "F", 2, "E1", 8) chkPow(chk, "F", 2, "E1", 16) chkPow(chk, "FF", 2, "FE01", 16) @@ -50,7 +50,7 @@ template testExp(chk, tst: untyped) = chkPow(chk, "F", 2, "E1", 64) chkPow(chk, "FF", 2, "FE01", 64) chkPow(chk, "FF", 3, "FD02FF", 64) - chkPow(chk, "FFF", 3, "FFD002FFF", 64) + chkPow(chk, "FFF", 3, "FFD002FFF", 64)]# chkPow(chk, "F", 2, "E1", 128) chkPow(chk, "FF", 2, "FE01", 128) @@ -64,6 +64,7 @@ static: suite "Wider unsigned int exp coverage": testExp(check, test) +#[ suite "Testing unsigned exponentiation": test "Simple exponentiation 5^3": @@ -84,3 +85,4 @@ suite "Testing 
unsigned exponentiation": check: a.pow(b) == "4922235242952026704037113243122008064".u256 check: a.pow(b.stuint(256)) == "4922235242952026704037113243122008064".u256 +]# \ No newline at end of file diff --git a/tests/test_uint_modular_arithmetic.nim b/tests/test_uint_modular_arithmetic.nim index 6b3ad19..db5842f 100644 --- a/tests/test_uint_modular_arithmetic.nim +++ b/tests/test_uint_modular_arithmetic.nim @@ -23,7 +23,7 @@ template chkPowMod(chk: untyped, a, b, m, c: string, bits: int) = template testModArith(chk, tst: untyped) = tst "addmod": - chkAddMod(chk, "F", "F", "7", "2", 8) + #[chkAddMod(chk, "F", "F", "7", "2", 8) chkAddMod(chk, "AAAA", "AA", "F", "0", 16) chkAddMod(chk, "BBBB", "AAAA", "9", "3", 16) @@ -36,7 +36,7 @@ template testModArith(chk, tst: untyped) = chkAddMod(chk, "AAAA", "AA", "F", "0", 64) chkAddMod(chk, "BBBB", "AAAA", "9", "3", 64) chkAddMod(chk, "BBBBBBBB", "AAAAAAAA", "9", "6", 64) - chkAddMod(chk, "BBBBBBBBBBBBBBBB", "AAAAAAAAAAAAAAAA", "9", "3", 64) + chkAddMod(chk, "BBBBBBBBBBBBBBBB", "AAAAAAAAAAAAAAAA", "9", "3", 64)]# chkAddMod(chk, "F", "F", "7", "2", 128) chkAddMod(chk, "AAAA", "AA", "F", "0", 128) @@ -47,7 +47,7 @@ template testModArith(chk, tst: untyped) = tst "submod": - chkSubMod(chk, "C", "3", "C", "9", 8) + #[chkSubMod(chk, "C", "3", "C", "9", 8) chkSubMod(chk, "1", "3", "C", "A", 8) chkSubMod(chk, "1", "FF", "C", "A", 8) @@ -64,7 +64,7 @@ template testModArith(chk, tst: untyped) = chkSubMod(chk, "1", "3", "C", "A", 64) chkSubMod(chk, "1", "FFFF", "C", "A", 64) chkSubMod(chk, "1", "FFFFFFFF", "C", "A", 64) - chkSubMod(chk, "1", "FFFFFFFFFFFFFFFF", "C", "A", 64) + chkSubMod(chk, "1", "FFFFFFFFFFFFFFFF", "C", "A", 64)]# chkSubMod(chk, "C", "3", "C", "9", 128) chkSubMod(chk, "1", "3", "C", "A", 128) @@ -74,7 +74,7 @@ template testModArith(chk, tst: untyped) = chkSubMod(chk, "1", "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", "C", "A", 128) tst "mulmod": - chkMulMod(chk, "C", "3", "C", "0", 8) + #[chkMulMod(chk, "C", "3", "C", "0", 8) 
chkMulMod(chk, "1", "3", "C", "3", 8) chkMulMod(chk, "1", "FF", "C", "3", 8) @@ -91,7 +91,7 @@ template testModArith(chk, tst: untyped) = chkMulMod(chk, "1", "3", "C", "3", 64) chkMulMod(chk, "1", "FFFF", "C", "3", 64) chkMulMod(chk, "1", "FFFFFFFF", "C", "3", 64) - chkMulMod(chk, "1", "FFFFFFFFFFFFFFFF", "C", "3", 64) + chkMulMod(chk, "1", "FFFFFFFFFFFFFFFF", "C", "3", 64)]# chkMulMod(chk, "C", "3", "C", "0", 128) chkMulMod(chk, "1", "3", "C", "3", 128) @@ -106,7 +106,7 @@ template testModArith(chk, tst: untyped) = discard else: tst "powmod": - chkPowMod(chk, "C", "3", "C", "0", 8) + #[chkPowMod(chk, "C", "3", "C", "0", 8) chkPowMod(chk, "1", "3", "C", "1", 8) chkPowMod(chk, "1", "FF", "C", "1", 8) chkPowMod(chk, "FF", "3", "C", "3", 8) @@ -130,7 +130,7 @@ template testModArith(chk, tst: untyped) = chkPowMod(chk, "FF", "3", "C", "3", 64) chkPowMod(chk, "FFFF", "3", "C", "3", 64) chkPowMod(chk, "FFFFFFFF", "3", "C", "3", 64) - chkPowMod(chk, "FFFFFFFFFFFFFFFF", "3", "C", "3", 64) + chkPowMod(chk, "FFFFFFFFFFFFFFFF", "3", "C", "3", 64)]# chkPowMod(chk, "C", "3", "C", "0", 128) chkPowMod(chk, "1", "3", "C", "1", 128) @@ -147,6 +147,7 @@ static: suite "Wider unsigned Modular arithmetic coverage": testModArith(check, test) +#[ suite "Modular arithmetic": test "Modular addition": @@ -202,3 +203,4 @@ suite "Modular arithmetic": check: powmod(P, Q, M) == expected +]# diff --git a/tests/test_uint_mul.nim b/tests/test_uint_mul.nim index 310987a..1d08167 100644 --- a/tests/test_uint_mul.nim +++ b/tests/test_uint_mul.nim @@ -10,11 +10,11 @@ import ../stint, unittest, test_helpers template chkMul(chk: untyped, a, b, c: string, bits: int) = - chk (fromHex(Stuint[bits], a) * fromHex(Stuint[bits], b)) == fromHex(Stuint[bits], c) + chk (fromHex(StUint[bits], a) * fromHex(StUint[bits], b)) == fromHex(StUint[bits], c) template testMul(chk, tst: untyped) = tst "operator `mul`": - chkMul(chk, "0", "3", "0", 8) + #[chkMul(chk, "0", "3", "0", 8) chkMul(chk, "1", "3", "3", 8) chkMul(chk, 
"64", "3", "2C", 8) # overflow @@ -34,7 +34,7 @@ template testMul(chk, tst: untyped) = chkMul(chk, "64", "3", "12C", 64) chkMul(chk, "1770", "46", "668A0", 64) chkMul(chk, "13880", "13880", "17D784000", 64) - chkMul(chk, "3B9ACA00", "E8D4A51000", "35C9ADC5DEA00000", 64) # overflow + chkMul(chk, "3B9ACA00", "E8D4A51000", "35C9ADC5DEA00000", 64) # overflow]# chkMul(chk, "0", "3", "0", 128) chkMul(chk, "1", "3", "3", 128) @@ -53,6 +53,7 @@ static: suite "Wider unsigned int muldiv coverage": testMul(check, test) +#[ suite "Testing unsigned int multiplication implementation": test "Multiplication with result fitting in low half": @@ -86,3 +87,4 @@ suite "Testing unsigned int multiplication implementation": let x = 9975492817.stuint(256) let y = 16.stuint(256) check x * y == 159607885072.stuint(256) +]# \ No newline at end of file From e5c352fde24b87d0214d8aa9df9470cbd416cceb Mon Sep 17 00:00:00 2001 From: jangko Date: Tue, 13 Jun 2023 08:13:39 +0700 Subject: [PATCH 24/26] fix stuint constructor --- stint/io.nim | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/stint/io.nim b/stint/io.nim index 6120ab4..bd311bb 100644 --- a/stint/io.nim +++ b/stint/io.nim @@ -45,9 +45,11 @@ template static_check_size(T: typedesc[SomeInteger], bits: static[int]) = func stuint*[T: SomeInteger](n: T, bits: static[int]): StUint[bits] {.inline.}= ## Converts an integer to an arbitrary precision integer. - result.limbs[0] = Word(n) when sizeof(n) > sizeof(Word): - result.limbs[1] = Word(n) shr WordBitWidth + result.limbs[0] = Word(n and Word.high) + result.limbs[1] = Word(n shr WordBitWidth) + else: + result.limbs[0] = Word(n) # func stint*[T: SomeInteger](n: T, bits: static[int]): StInt[bits] {.inline.}= # ## Converts an integer to an arbitrary precision signed integer. 
From 7ce536423a4b742e1be3000c9dc1e0dc0a48304b Mon Sep 17 00:00:00 2001 From: jangko Date: Tue, 13 Jun 2023 08:14:52 +0700 Subject: [PATCH 25/26] disable dot borrow temporary --- stint/private/datatypes.nim | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/stint/private/datatypes.nim b/stint/private/datatypes.nim index 47bbf23..569245c 100644 --- a/stint/private/datatypes.nim +++ b/stint/private/datatypes.nim @@ -37,19 +37,16 @@ type limbs*: array[bits.wordsRequired, Word] # Limbs-Endianess is little-endian -when (NimMajor, NimMinor) < (1,9): - type - StInt*[bits: static[int]] = object - ## Stack-based integer - ## Signed - limbs*: array[bits.wordsRequired, Word] -else: - type - StInt*[bits: static[int]] {.borrow: `.`.} = distinct StUint[bits] - ## Stack-based integer - ## Signed + StInt*[bits: static[int]] = object + ## Stack-based integer + ## Signed + limbs*: array[bits.wordsRequired, Word] + + # {.borrow: `.`.} only works with nim-devel + # StInt*[bits: static[int]] {.borrow: `.`.} = distinct StUint[bits] + ## Stack-based integer + ## Signed -type Carry* = uint8 # distinct range[0'u8 .. 1] Borrow* = uint8 # distinct range[0'u8 .. 1] From 8c5a96463c6f17b2d858ea33d6552bbdcd7a4498 Mon Sep 17 00:00:00 2001 From: jangko Date: Tue, 13 Jun 2023 08:35:35 +0700 Subject: [PATCH 26/26] nimvm workaround for primitives --- stint/private/primitives/addcarry_subborrow.nim | 8 ++++---- stint/private/primitives/extended_precision.nim | 15 ++++++++++++--- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/stint/private/primitives/addcarry_subborrow.nim b/stint/private/primitives/addcarry_subborrow.nim index 75aa21a..3751602 100644 --- a/stint/private/primitives/addcarry_subborrow.nim +++ b/stint/private/primitives/addcarry_subborrow.nim @@ -107,14 +107,14 @@ func addC*(cOut: var Carry, sum: var uint32, a, b: uint32, cIn: Carry) {.inline. 
## (CarryOut, Sum) <- a + b + CarryIn when nimvm: let dblPrec = uint64(cIn) + uint64(a) + uint64(b) - sum = (uint32)(dblPrec) + sum = uint32(dblPrec and uint32.high) cOut = Carry(dblPrec shr 32) else: when X86: cOut = addcarry_u32(cIn, a, b, sum) else: let dblPrec = uint64(cIn) + uint64(a) + uint64(b) - sum = (uint32)(dblPrec) + sum = uint32(dblPrec) cOut = Carry(dblPrec shr 32) func subB*(bOut: var Borrow, diff: var uint32, a, b: uint32, bIn: Borrow) {.inline.} = @@ -122,7 +122,7 @@ func subB*(bOut: var Borrow, diff: var uint32, a, b: uint32, bIn: Borrow) {.inli ## (BorrowOut, Diff) <- a - b - borrowIn when nimvm: let dblPrec = uint64(a) - uint64(b) - uint64(bIn) - diff = (uint32)(dblPrec) + diff = uint32(dblPrec and uint32.high) # On borrow the high word will be 0b1111...1111 and needs to be masked bOut = Borrow((dblPrec shr 32) and 1) else: @@ -130,7 +130,7 @@ func subB*(bOut: var Borrow, diff: var uint32, a, b: uint32, bIn: Borrow) {.inli bOut = subborrow_u32(bIn, a, b, diff) else: let dblPrec = uint64(a) - uint64(b) - uint64(bIn) - diff = (uint32)(dblPrec) + diff = uint32(dblPrec) # On borrow the high word will be 0b1111...1111 and needs to be masked bOut = Borrow((dblPrec shr 32) and 1) diff --git a/stint/private/primitives/extended_precision.nim b/stint/private/primitives/extended_precision.nim index cd04828..4f58e65 100644 --- a/stint/private/primitives/extended_precision.nim +++ b/stint/private/primitives/extended_precision.nim @@ -41,7 +41,10 @@ func mul*(hi, lo: var uint32, a, b: uint32) {.inline.} = ## Extended precision multiplication ## (hi, lo) <- a*b let dblPrec = uint64(a) * uint64(b) - lo = uint32(dblPrec) + when nimvm: + lo = uint32(dblPrec and uint32.high) + else: + lo = uint32(dblPrec) hi = uint32(dblPrec shr 32) func muladd1*(hi, lo: var uint32, a, b, c: uint32) {.inline.} = @@ -51,7 +54,10 @@ func muladd1*(hi, lo: var uint32, a, b, c: uint32) {.inline.} = ## Note: 0xFFFFFFFF² -> (hi: 0xFFFFFFFE, lo: 0x00000001) ## so adding any c cannot 
overflow let dblPrec = uint64(a) * uint64(b) + uint64(c) - lo = uint32(dblPrec) + when nimvm: + lo = uint32(dblPrec and uint32.high) + else: + lo = uint32(dblPrec) hi = uint32(dblPrec shr 32) func muladd2*(hi, lo: var uint32, a, b, c1, c2: uint32) {.inline.}= @@ -63,7 +69,10 @@ func muladd2*(hi, lo: var uint32, a, b, c1, c2: uint32) {.inline.}= ## so adding 0xFFFFFFFF leads to (hi: 0xFFFFFFFF, lo: 0x00000000) ## and we have enough space to add again 0xFFFFFFFF without overflowing let dblPrec = uint64(a) * uint64(b) + uint64(c1) + uint64(c2) - lo = uint32(dblPrec) + when nimvm: + lo = uint32(dblPrec and uint32.high) + else: + lo = uint32(dblPrec) hi = uint32(dblPrec shr 32) # ############################################################