diff --git a/README.md b/README.md index 0a5906c..76b4191 100644 --- a/README.md +++ b/README.md @@ -20,8 +20,7 @@ Main focus: - Uint2048 for Ethereum Bloom filters - Ease of use: - Use traditional `+`, `-`, `+=`, etc operators like on native types - - Representation of numbers in memory is the exact same as native types and endianness aware. - - In practice that means that interfacing with binary blobs representing numbers from cryptographic libraries can be done with a `cast` if it represents a Uint256, Uint512, Uint1024, Uint2048. + - converting to and from raw byte BigInts (also called octet string in IETF specs) - converting to and from Hex - converting to and from decimal strings diff --git a/benchmarks/bench.nim b/benchmarks/bench.nim new file mode 100644 index 0000000..36d9a55 --- /dev/null +++ b/benchmarks/bench.nim @@ -0,0 +1,91 @@ +import ../stint, std/[times, monotimes] + +template bench(desc: string, body: untyped) = + let start = getMonotime() + body + let stop = getMonotime() + echo desc,": ", inMilliseconds(stop-start), " ms" + +# Warmup on normal int to ensure max CPU freq +# Complex enough that the compiler doesn't optimize it away + +proc warmup() = + var foo = 123 + bench "Warmup": + for i in 0 ..< 10_000_000: + foo += i*i mod 456 + foo = foo mod 789 + +warmup() +#################################### + +let a = [123'u64, 123'u64, 123'u64, 123'u64] +let m = [456'u64, 456'u64, 456'u64, 45'u64] + +proc add_stint(a, m: array[4, uint64]) = + let aU256 = cast[Stuint[256]](a) + let mU256 = cast[Stuint[256]](m) + + bench "Add (stint)": + var foo = aU256 + for i in 0 ..< 100_000_000: + foo += mU256 + foo += aU256 + +proc mul_stint(a, m: array[4, uint64]) = + let aU256 = cast[Stuint[256]](a) + let mU256 = cast[Stuint[256]](m) + + bench "Mul (stint)": + var foo = aU256 + for i in 0 ..< 100_000_000: + foo += (foo * foo) + +proc mod_stint(a, m: array[4, uint64]) = + let aU256 = cast[Stuint[256]](a) + let mU256 = cast[Stuint[256]](m) + + bench "Mod (stint)": + var foo = aU256 + for i in 0 ..< 100_000_000: + foo += (foo * foo) mod mU256 + +add_stint(a, m) +mul_stint(a, m) +mod_stint(a, m) + +when defined(bench_ttmath): + # need C++ + import ttmath, ../tests/ttmath_compat + + proc add_ttmath(a, m: array[4, uint64]) = + let aU256 = a.astt() + let mU256 = m.astt() + + bench "Add (ttmath)": + var foo = aU256 + for i in 0 ..< 100_000_000: + foo += mU256 + foo += aU256 + + proc mul_ttmath(a, m: array[4, uint64]) = + let aU256 = a.astt() + let mU256 = m.astt() + + bench "Mul (ttmath)": + var foo = aU256 + for i in 0 ..< 100_000_000: + foo += (foo * foo) + + proc mod_ttmath(a, m: array[4, uint64]) = + let aU256 = a.astt() + let mU256 = m.astt() + + bench "Mod (ttmath)": + var foo = aU256 + for i in 0 ..< 100_000_000: + foo += (foo * foo) mod mU256 + + add_ttmath(a, m) + mul_ttmath(a, m) + mod_ttmath(a, m) \ No newline at end of file diff --git a/benchmarks/bench_mod.nim b/benchmarks/bench_mod.nim deleted file mode 100644 index 0f79b68..0000000 --- a/benchmarks/bench_mod.nim +++ /dev/null @@ -1,55 +0,0 @@ -import ../stint, times - - -# Warmup on normal int -var start = cpuTime() -block: - var foo = 123 - for i in 0 ..< 10_000_000: - foo += i*i mod 456 - foo = foo mod 789 - -# Compiler shouldn't optimize away the results as cpuTime rely on sideeffects -var stop = cpuTime() -echo "Warmup: " & $(stop - start) & "s" - -#################################### - - -start = cpuTime() -block: - var foo = 123.u256 - for i in 0 ..< 10_000_000: - foo += i.u256 * i.u256 mod 456.u256 - foo = foo mod 789.u256 - 
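The new `benchmarks/bench.nim` above builds its timing harness on `std/monotimes`, which is monotonic and therefore better suited to interval measurement than the wall-clock `cpuTime` used by the old benchmark being deleted below. A minimal usage sketch of the `bench` template (the accumulator and loop bound are illustrative only):

```nim
import std/[times, monotimes]

template bench(desc: string, body: untyped) =
  let start = getMonotime()
  body
  let stop = getMonotime()
  echo desc, ": ", inMilliseconds(stop - start), " ms"

var acc = 0
bench "Sum 0 ..< 1_000_000":
  for i in 0 ..< 1_000_000:
    acc += i
echo acc  # consume the result so the loop is not optimized away
```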
-stop = cpuTime()
-echo "Library: " & $(stop - start) & "s"
-
-when defined(bench_ttmath):
-  # need C++
-  import ttmath
-
-  template tt_u256(a: int): UInt[256] = ttmath.u256(a.uint)
-
-  start = cpuTime()
-  block:
-    var foo = 123.tt_u256
-    for i in 0 ..< 10_000_000:
-      foo += i.tt_u256 * i.tt_u256 mod 456.tt_u256
-      foo = foo mod 789.tt_u256
-
-  stop = cpuTime()
-  echo "TTMath: " & $(stop - start) & "s"
-
-# On my i5-5257 broadwell with the flags:
-# nim c -d:release -d:bench_ttmath
-# Warmup: 0.04060799999999999s
-# Library: 0.9576759999999999s
-# TTMath: 0.758443s
-
-
-# After PR #54 for compile-time evaluation
-# which includes loop unrolling but may bloat the code
-# Warmup: 0.03993500000000001s
-# Library: 0.848464s
diff --git a/helpers/prng_unsafe.nim b/helpers/prng_unsafe.nim
new file mode 100644
index 0000000..d977e6d
--- /dev/null
+++ b/helpers/prng_unsafe.nim
@@ -0,0 +1,260 @@
+# Constantine
+# Copyright (c) 2018-2019 Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  ../stint/io,
+  ../stint/private/datatypes
+
+# ############################################################
+#
+#            Pseudo-Random Number Generator
+#       for testing and benchmarking purposes
+#
+# ############################################################
+#
+# The recommendation by Vigna at http://prng.di.unimi.it
+# is to have a period of t^2 if we need t values (i.e. about 2^1024)
+# but also that for all practical purposes 2^256 period is enough
+#
+# We use 2^512 since our main use-case is uint256
+
+type RngState* = object
+  ## This is the state of a Xoshiro512** PRNG
+  ## Unsafe: for testing and benchmarking purposes only
+  s: array[8, uint64]
+
+func splitMix64(state: var uint64): uint64 =
+  state += 0x9e3779b97f4a7c15'u64
+  result = state
+  result = (result xor (result shr 30)) * 0xbf58476d1ce4e5b9'u64
+  result = (result xor (result shr 27)) * 0x94d049bb133111eb'u64
+  result = result xor (result shr 31)
+
+func seed*(rng: var RngState, x: SomeInteger) =
+  ## Seed the random number generator with a fixed seed
+  var sm64 = uint64(x)
+  rng.s[0] = splitMix64(sm64)
+  rng.s[1] = splitMix64(sm64)
+  rng.s[2] = splitMix64(sm64)
+  rng.s[3] = splitMix64(sm64)
+  rng.s[4] = splitMix64(sm64)
+  rng.s[5] = splitMix64(sm64)
+  rng.s[6] = splitMix64(sm64)
+  rng.s[7] = splitMix64(sm64)
+
+func rotl(x: uint64, k: static int): uint64 {.inline.} =
+  return (x shl k) or (x shr (64 - k))
+
+template `^=`(x: var uint64, y: uint64) =
+  x = x xor y
+
+func next(rng: var RngState): uint64 =
+  ## Compute a random uint64 from the input state
+  ## using xoshiro512** algorithm by Vigna et al
+  ## State is updated.
+ result = rotl(rng.s[1] * 5, 7) * 9 + + let t = rng.s[1] shl 11 + rng.s[2] ^= rng.s[0]; + rng.s[5] ^= rng.s[1]; + rng.s[1] ^= rng.s[2]; + rng.s[7] ^= rng.s[3]; + rng.s[3] ^= rng.s[4]; + rng.s[4] ^= rng.s[5]; + rng.s[0] ^= rng.s[6]; + rng.s[6] ^= rng.s[7]; + + rng.s[6] ^= t; + + rng.s[7] = rotl(rng.s[7], 21); + +# Bithacks +# ------------------------------------------------------------ + +proc clearMask[T: SomeInteger](v: T, mask: T): T {.inline.} = + ## Returns ``v``, with all the ``1`` bits from ``mask`` set to 0 + v and not mask + +proc clearBit*[T: SomeInteger](v: T, bit: T): T {.inline.} = + ## Returns ``v``, with the bit at position ``bit`` set to 0 + v.clearMask(1.T shl bit) + +# Integer ranges +# ------------------------------------------------------------ + +func random_unsafe*(rng: var RngState, maxExclusive: uint32): uint32 = + ## Generate a random integer in 0 ..< maxExclusive + ## Uses an unbiaised generation method + ## See Lemire's algorithm modified by Melissa O'Neill + ## https://www.pcg-random.org/posts/bounded-rands.html + let max = maxExclusive + var x = uint32 rng.next() + var m = x.uint64 * max.uint64 + var l = uint32 m + if l < max: + var t = not(max) + 1 # -max + if t >= max: + t -= max + if t >= max: + t = t mod max + while l < t: + x = uint32 rng.next() + m = x.uint64 * max.uint64 + l = uint32 m + return uint32(m shr 32) + +func random_unsafe*[T: SomeInteger](rng: var RngState, inclRange: Slice[T]): T = + ## Return a random integer in the given range. + ## The range bounds must fit in an int32. + let maxExclusive = inclRange.b + 1 - inclRange.a + result = T(rng.random_unsafe(uint32 maxExclusive)) + result += inclRange.a + +# Containers +# ------------------------------------------------------------ + +func sample_unsafe*[T](rng: var RngState, src: openarray[T]): T = + ## Return a random sample from an array + result = src[rng.random_unsafe(uint32 src.len)] + +# BigInts +# ------------------------------------------------------------ +# +# Statistics note: +# - A skewed distribution is not symmetric, it has a longer tail in one direction. +# for example a RNG that is not centered over 0.5 distribution of 0 and 1 but +# might produces more 1 than 0 or vice-versa. +# - A bias is a result that is consistently off from the true value i.e. +# a deviation of an estimate from the quantity under observation + +func random_unsafe(rng: var RngState, a: var SomeBigInteger) = + ## Initialize a standalone BigInt + for i in 0 ..< a.limbs.len: + a.limbs[i] = Word(rng.next()) + +func random_word_highHammingWeight(rng: var RngState): Word = + let numZeros = rng.random_unsafe(WordBitWidth div 3) # Average Hamming Weight is 1-0.33/2 = 0.83 + result = high(Word) + for _ in 0 ..< numZeros: + result = result.clearBit rng.random_unsafe(WordBitWidth) + +func random_highHammingWeight(rng: var RngState, a: var SomeBigInteger) = + ## Initialize a standalone BigInt + ## with high Hamming weight + ## to have a higher probability of triggering carries + for i in 0 ..< a.limbs.len: + a.limbs[i] = Word rng.random_word_highHammingWeight() + +func random_long01Seq(rng: var RngState, a: var openArray[byte]) = + ## Initialize a bytearray + ## It is skewed towards producing strings of 1111... 
and 0000 + ## to trigger edge cases + # See libsecp256k1: https://github.com/bitcoin-core/secp256k1/blob/dbd41db1/src/testrand_impl.h#L90-L104 + let Bits = a.len * 8 + var bit = 0 + zeroMem(a[0].addr, a.len) + while bit < Bits : + var now = 1 + (rng.random_unsafe(1 shl 6) * rng.random_unsafe(1 shl 5) + 16) div 31 + let val = rng.sample_unsafe([0, 1]) + while now > 0 and bit < Bits: + a[bit shr 3] = a[bit shr 3] or byte(val shl (bit and 7)) + dec now + inc bit + +func random_long01Seq(rng: var RngState, a: var SomeBigInteger) = + ## Initialize a bigint + ## It is skewed towards producing strings of 1111... and 0000 + ## to trigger edge cases + var buf: array[(a.bits + 7) div 8, byte] + rng.random_long01Seq(buf) + a = (typeof a).fromBytes(buf, bigEndian) + +# Byte sequences +# ------------------------------------------------------------ + +func random_byte_seq*(rng: var RngState, length: int): seq[byte] = + result.newSeq(length) + for b in result.mitems: + b = byte rng.next() + +# Generic over any Stint type +# ------------------------------------------------------------ + +func random_unsafe*(rng: var RngState, T: typedesc): T = + ## Create a random Field or Extension Field or Curve Element + ## Unsafe: for testing and benchmarking purposes only + rng.random_unsafe(result) + +func random_highHammingWeight*(rng: var RngState, T: typedesc): T = + ## Create a random Field or Extension Field or Curve Element + ## Skewed towards high Hamming Weight + rng.random_highHammingWeight(result) + +func random_long01Seq*(rng: var RngState, T: typedesc): T = + ## Create a random Field or Extension Field or Curve Element + ## Skewed towards long bitstrings of 0 or 1 + rng.random_long01Seq(result) + +type + RandomGen* = enum + Uniform + HighHammingWeight + Long01Sequence + +func random_elem*(rng: var RngState, T: typedesc, gen: RandomGen): T {.inline, noInit.} = + case gen + of Uniform: + result = rng.random_unsafe(T) + of HighHammingWeight: + result = rng.random_highHammingWeight(T) + of Long01Sequence: + result = rng.random_long01Seq(T) + +# Sanity checks +# ------------------------------------------------------------ + +when isMainModule: + import std/[tables, times, strutils] + + var rng: RngState + let timeSeed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32 + rng.seed(timeSeed) + echo "prng_sanity_checks xoshiro512** seed: ", timeSeed + + + proc test[T](s: Slice[T]) = + var c = initCountTable[int]() + + for _ in 0 ..< 1_000_000: + c.inc(rng.random_unsafe(s)) + + echo "1'000'000 pseudo-random outputs from ", s.a, " to ", s.b, " (incl): ", c + + test(0..1) + test(0..2) + test(1..52) + test(-10..10) + + echo "\n-----------------------------\n" + echo "High Hamming Weight check" + for _ in 0 ..< 10: + let word = rng.random_word_highHammingWeight() + echo "0b", cast[BiggestInt](word).toBin(WordBitWidth), " - 0x", word.toHex() + + echo "\n-----------------------------\n" + echo "Long strings of 0 or 1 check" + for _ in 0 ..< 10: + var a: BigInt[127] + rng.random_long01seq(a) + stdout.write "0b" + for word in a.limbs: + stdout.write cast[BiggestInt](word).toBin(WordBitWidth) + stdout.write " - 0x" + for word in a.limbs: + stdout.write word.BaseType.toHex() + stdout.write '\n' diff --git a/helpers/staticfor.nim b/helpers/staticfor.nim new file mode 100644 index 0000000..95a3864 --- /dev/null +++ b/helpers/staticfor.nim @@ -0,0 +1,29 @@ +import std/macros + +proc replaceNodes(ast: NimNode, what: NimNode, by: NimNode): NimNode = + # Replace "what" ident node by "by" + proc inspect(node: 
NimNode): NimNode = + case node.kind: + of {nnkIdent, nnkSym}: + if node.eqIdent(what): + return by + return node + of nnkEmpty: + return node + of nnkLiterals: + return node + else: + var rTree = node.kind.newTree() + for child in node: + rTree.add inspect(child) + return rTree + result = inspect(ast) + +macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped): untyped = + ## staticFor [min inclusive, max exclusive) + result = newStmtList() + for i in start ..< stopEx: + result.add nnkBlockStmt.newTree( + ident("unrolledIter_" & $idx & $i), + body.replaceNodes(idx, newLit i) + ) \ No newline at end of file diff --git a/stint.nim b/stint.nim index 2b7c29a..64f5a42 100644 --- a/stint.nim +++ b/stint.nim @@ -1,5 +1,5 @@ # Stint -# Copyright 2018 Status Research & Development GmbH +# Copyright 2018-2023 Status Research & Development GmbH # Licensed under either of # # * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) @@ -7,8 +7,11 @@ # # at your option. This file may not be copied, modified, or distributed except according to those terms. -import stint/[bitops2, endians2, intops, io, modular_arithmetic, literals_stint] -export bitops2, endians2, intops, io, modular_arithmetic, literals_stint +# import stint/[bitops2, endians2, intops, io, modular_arithmetic, literals_stint] +# export bitops2, endians2, intops, io, modular_arithmetic, literals_stint + +import stint/[io, uintops, intops, literals_stint, modular_arithmetic] +export io, uintops, intops, literals_stint, modular_arithmetic type Int128* = StInt[128] @@ -23,7 +26,7 @@ func u256*(n: SomeInteger): UInt256 {.inline.} = n.stuint(256) func u256*(s: string): UInt256 {.inline.} = s.parse(UInt256) func i128*(n: SomeInteger): Int128 {.inline.} = n.stint(128) -func i128*(s: string): Int128 {.inline.} = s.parse(Int128) +# func i128*(s: string): Int128 {.inline.} = s.parse(Int128) func i256*(n: SomeInteger): Int256 {.inline.} = n.stint(256) -func i256*(s: string): Int256 {.inline.} = s.parse(Int256) +# func i256*(s: string): Int256 {.inline.} = s.parse(Int256) diff --git a/stint.nimble b/stint.nimble index 3eeacfd..b7fb0a5 100644 --- a/stint.nimble +++ b/stint.nimble @@ -1,5 +1,5 @@ packageName = "stint" -version = "0.0.1" +version = "2.0.0" author = "Status Research & Development GmbH" description = "Efficient stack-based multiprecision int in Nim" license = "Apache License 2.0 or MIT" @@ -7,13 +7,13 @@ skipDirs = @["tests", "benchmarks"] ### Dependencies # TODO test only requirements don't work: https://github.com/nim-lang/nimble/issues/482 -requires "nim >= 1.6.0", +requires "nim >= 1.6.12", "stew" - #, "https://github.com/alehander42/nim-quicktest >= 0.18.0", "https://github.com/status-im/nim-ttmath" proc test(args, path: string) = if not dirExists "build": mkDir "build" + exec "nim " & getEnv("TEST_LANG", "c") & " " & getEnv("NIMFLAGS") & " " & args & " --outdir:build -r --hints:off --warnings:off --skipParentCfg" & " --styleCheck:usages --styleCheck:error " & path @@ -22,23 +22,17 @@ proc test(args, path: string) = " --outdir:build -r --mm:refc --hints:off --warnings:off --skipParentCfg" & " --styleCheck:usages --styleCheck:error " & path -task test, "Run all tests - test and production implementation": - # Run tests for internal procs - test implementation (StUint[64] = 2x uint32 - test "-d:stint_test", "tests/internal.nim" - # Run tests for internal procs - prod implementation (StUint[64] = uint64 - test "", "tests/internal.nim" - # Run all tests - 
test implementation (StUint[64] = 2x uint32 - test "-d:stint_test", "tests/all_tests.nim" - # Run all tests - prod implementation (StUint[64] = uint64 - test "--threads:on", "tests/all_tests.nim" +task test_internal, "Run tests for internal procs": + test "", "tests/internal" + +task test_public_api, "Run all tests - prod implementation (StUint[64] = uint64": + test "", "tests/all_tests" + +task test_uint256_ttmath, "Run random tests Uint256 vs TTMath": + requires "https://github.com/alehander42/nim-quicktest >= 0.18.0", "https://github.com/status-im/nim-ttmath" + switch("define", "release") + test "uint256_ttmath", "cpp" - ## quicktest-0.20.0/quicktest.nim(277, 30) Error: cannot evaluate at compile time: BUILTIN_NAMES - ## - # # Run random tests (debug mode) - test implementation (StUint[64] = 2x uint32) - # test "-d:stint_test", "tests/property_based.nim" - # # Run random tests (release mode) - test implementation (StUint[64] = 2x uint32) - # test "-d:stint_test -d:release", "tests/property_based.nim" - # # Run random tests Uint256 (debug mode) vs TTMath (StUint[256] = 8 x uint32) - # test "", "tests/property_based.nim" - # # Run random tests Uint256 (release mode) vs TTMath (StUint[256] = 4 x uint64) - # test "-d:release", "tests/property_based.nim" +task test, "Run all tests": + exec "nimble test_internal" + exec "nimble test_public_api" diff --git a/stint/bitops2.nim b/stint/bitops2.nim deleted file mode 100644 index 9bad7c8..0000000 --- a/stint/bitops2.nim +++ /dev/null @@ -1,16 +0,0 @@ -# Stint -# Copyright 2018 Status Research & Development GmbH -# Licensed under either of -# -# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) -# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) -# -# at your option. This file may not be copied, modified, or distributed except according to those terms. - -import ./private/[bitops2_priv, datatypes] - -func countOnes*(x: StUint): int {.inline.} = countOnes(x.data) -func parity*(x: StUint): int {.inline.} = parity(x.data) -func firstOne*(x: StUint): int {.inline.} = firstOne(x.data) -func leadingZeros*(x: StUint): int {.inline.} = leadingZeros(x.data) -func trailingZeros*(x: StUint): int {.inline.} = trailingZeros(x.data) diff --git a/stint/endians2.nim b/stint/endians2.nim index be6beea..6bc2990 100644 --- a/stint/endians2.nim +++ b/stint/endians2.nim @@ -7,102 +7,243 @@ # # at your option. This file may not be copied, modified, or distributed except according to those terms. 
-import private/[bitops2_priv, endians2_priv, datatypes, compiletime_helpers] +import private/datatypes -import stew/endians2 -export endians2 +{.push raises: [IndexDefect], noinit, gcsafe.} -func swapBytes*(x: StUint): StUint {.inline.} = StUint(data: swapBytes(x.data)) - -func toBytes*[bits: static int](x: StUint[bits], endian: Endianness = system.cpuEndian): - array[bits div 8, byte] {.inline.} = - toBytes(x.data, endian) - -func toBytesLE*[bits: static int](x: StUint[bits]): - array[bits div 8, byte] {.inline.} = - toBytes(x, littleEndian) - -func toBytesBE*[bits: static int](x: StUint[bits]): - array[bits div 8, byte] {.inline.} = - toBytes(x, bigEndian) - -func fromBytes*[bits: static int]( - T: typedesc[StUint[bits]], - x: array[bits div 8, byte], - endian: Endianness = system.cpuEndian): T {.inline, noinit.} = +# Serialization +# ------------------------------------------------------------------------------------------ +template toByte(x: SomeUnsignedInt): byte = + ## At compile-time, conversion to bytes checks the range + ## we want to ensure this is done at the register level + ## at runtime in a single "mov byte" instruction when nimvm: - copyFromArray(result.data, x) + byte(x and 0xFF) else: - copyMem(addr result, unsafeAddr x[0], bits div 8) - - if endian != system.cpuEndian: - result = swapBytes(result) - -func fromBytes*[bits: static int]( - T: typedesc[StUint[bits]], - x: openArray[byte], - endian: Endianness = system.cpuEndian): T {.inline.} = - # TODO fromBytesBE in io.nim handles this better, merge the two! - var tmp: array[bits div 8, byte] - if x.len < tmp.len: - let offset = if endian == bigEndian: tmp.len - x.len else: 0 - for i in 0.. 0: + when cpuEndian == littleEndian: + let w = if src_idx < src.limbs.len: src.limbs[src_idx] + else: 0 + inc src_idx + else: + let w = if src_idx >= 0: src.limbs[src_idx] + else: 0 + dec src_idx + + if acc_len == 0: + # We need to refill the buffer to output 64-bit + acc = w + acc_len = WordBitWidth + else: + let lo = acc + acc = w + + if tail >= sizeof(Word): + # Unrolled copy + result.blobFrom(src = lo, dst_idx, littleEndian) + dst_idx += sizeof(Word) + tail -= sizeof(Word) + else: + # Process the tail and exit + when cpuEndian == littleEndian: + # When requesting little-endian on little-endian platform + # we can just copy each byte + # tail is inclusive + for i in 0 ..< tail: + result[dst_idx+i] = toByte(lo shr (i*8)) + else: # TODO check this + # We need to copy from the end + for i in 0 ..< tail: + result[dst_idx+i] = toByte(lo shr ((tail-i)*8)) + return + +func toBytesBE*[bits: static int](src: StUint[bits]): array[bits div 8, byte] {.inline.} = + var + src_idx = 0 + acc: Word = 0 + acc_len = 0 + + when cpuEndian == bigEndian: + srcIdx = src.limbs.len - 1 + + var tail = result.len + while tail > 0: + when cpuEndian == littleEndian: + let w = if src_idx < src.limbs.len: src.limbs[src_idx] + else: 0 + inc src_idx + else: + let w = if src_idx >= 0: src.limbs[src_idx] + else: 0 + dec src_idx + + if acc_len == 0: + # We need to refill the buffer to output 64-bit + acc = w + acc_len = WordBitWidth + else: + let lo = acc + acc = w + + if tail >= sizeof(Word): + # Unrolled copy + tail -= sizeof(Word) + result.blobFrom(src = lo, tail, bigEndian) + else: + # Process the tail and exit + when cpuEndian == littleEndian: + # When requesting little-endian on little-endian platform + # we can just copy each byte + # tail is inclusive + for i in 0 ..< tail: + result[tail-1-i] = toByte(lo shr (i*8)) + else: + # We need to copy from the end + for 
i in 0 ..< tail: + result[tail-1-i] = toByte(lo shr ((tail-i)*8)) + return + +func toBytes*[bits: static int](x: StUint[bits], endian: Endianness = bigEndian): array[bits div 8, byte] {.inline.} = + ## Default to bigEndian + if endian == littleEndian: + result = x.toBytesLE() else: - for i in 0..= WordBitWidth: + result.limbs[dstIdx] = accum + inc dstIdx + accumBits -= WordBitWidth + accum = srcByte shr (8 - accumBits) + + if dstIdx < result.limbs.len: + result.limbs[dstIdx] = accum + for fillIdx in dstIdx+1 ..< result.limbs.len: + result.limbs[fillIdx] = 0 + else: # src and CPU are bigEndian + dstIdx = result.limbs.len-1 + + for srcIdx in countdown(x.len-1, 0): + let srcByte = x[srcIdx] + + accum = accum or (srcByte shl accumBits) + accumBits += 8 + + if accumBits >= WordBitWidth: + result.limbs[dstIdx] = accum + dec dstIdx + accumBits -= WordBitWidth + accum = srcByte shr (8 - accumBits) + + if dstIdx > 0: + result.limbs[dstIdx] = accum + for fillIdx in 0 ..< dstIdx: + result.limbs[fillIdx] = 0 func fromBytesLE*[bits: static int]( T: typedesc[StUint[bits]], - x: openArray[byte]): StUint[bits] {.inline.} = + x: openArray[byte]): T = ## Read little endian bytes and convert to an integer. At runtime, v must ## contain at least sizeof(T) bytes. By default, native endianess is used - ## which is not portable! - fromBytes(T, x, littleEndian) - -func toLE*[bits: static int](x: StUint[bits]): StUint[bits] {.inline.} = - ## Convert a native endian value to little endian. Consider toBytesLE instead - ## which may prevent some confusion. - if cpuEndian == littleEndian: x - else: x.swapBytes - -func fromLE*[bits: static int](x: StUint[bits]): StUint[bits] {.inline.} = - ## Read a little endian value and return the corresponding native endian - # there's no difference between this and toLE, except when reading the code - toLE(x) + ## which is not portable! (i.e. use fixed-endian byte array or hex for serialization) + + var accum: Word + var accumBits: int + var dstIdx: int + + when cpuEndian == littleEndian: # src and CPU are little-endian + dstIdx = 0 + + for srcIdx in 0 ..< x.len: + let srcByte = x[srcIdx] + + accum = accum or (srcByte shl accumBits) + accumBits += 8 + + if accumBits >= WordBitWidth: + result.limbs[dstIdx] = accum + inc dstIdx + accumBits -= WordBitWidth + accum = srcByte shr (8 - accumBits) + + if dstIdx < result.limbs.len: + result.limbs[dstIdx] = accum + for fillIdx in dstIdx+1 ..< result.limbs.len: + result.limbs[fillIdx] = 0 + else: # src is little endian, CPU is bigEndian + dstIdx = result.limbs.len-1 + + for srcIdx in 0 ..< x.len: + let srcByte = x[srcIdx] + + accum = accum or (srcByte shl accumBits) + accumBits += 8 + + if accumBits >= WordBitWidth: + result.limbs[dstIdx] = accum + dec dstIdx + accumBits -= WordBitWidth + accum = srcByte shr (8 - accumBits) + + if dstIdx > 0: + result.limbs[dstIdx] = accum + for fillIdx in 0 ..< dstIdx: + result.limbs[fillIdx] = 0 + +func fromBytes*[bits: static int]( + T: typedesc[StUint[bits]], + x: openArray[byte], + srcEndian: Endianness = bigEndian): T {.inline.} = + ## Read an source bytearray with the specified endianness and + ## convert it to an integer + ## Default to bigEndian + if srcEndian == littleEndian: + result = fromBytesLE(T, x) + else: + result = fromBytesBE(T, x) diff --git a/stint/intops.nim b/stint/intops.nim index fa97d52..90ed0d2 100644 --- a/stint/intops.nim +++ b/stint/intops.nim @@ -7,19 +7,18 @@ # # at your option. This file may not be copied, modified, or distributed except according to those terms. 
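To make the rewritten `endians2` serialization above concrete, here is a round-trip sketch; it assumes the new `toBytes`/`fromBytes` overloads are exported through the top-level `stint` module as the rest of this PR intends:

```nim
import stint

let x = 0xdeadbeef'u64.u256

# toBytes defaults to big-endian: most significant byte first,
# left-padded with zero bytes to the full 32-byte width.
let be = x.toBytes()
doAssert UInt256.fromBytes(be) == x

# Both directions accept an explicit endianness argument.
let le = x.toBytes(littleEndian)
doAssert UInt256.fromBytes(le, littleEndian) == x
```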
-import ./private/[bitops2_priv, datatypes] +import ./private/[datatypes] -export StInt, StUint -export IntImpl, intImpl, UintImpl, uintImpl, bitsof # TODO: remove the need to export those +export StInt +#export IntImpl, intImpl, UintImpl, uintImpl, bitsof # TODO: remove the need to export those -type SomeBigInteger = StUint|StInt +#import ./private/initialization -import ./private/initialization - -func zero*[bits: static[int]](T: typedesc[StUint[bits] or StInt[bits]]): T {.inline.} = +func zero*[bits: static[int]](T: typedesc[StInt[bits]]): T {.inline.} = ## Returns the zero of the input type discard - + +#[ func one*[bits: static[int]](T: typedesc[StUint[bits]]): T {.inline.} = ## Returns the one of the input type result.data = one(type result.data) @@ -159,3 +158,4 @@ func pow*(x: StUint, y: StUint): StUint {.inline.} = result.data = x.data.pow(y.data) else: result.data = x.data ^ y.data +]# \ No newline at end of file diff --git a/stint/io.nim b/stint/io.nim index a512c05..bd311bb 100644 --- a/stint/io.nim +++ b/stint/io.nim @@ -1,5 +1,5 @@ # Stint -# Copyright 2018 Status Research & Development GmbH +# Copyright 2018-2023 Status Research & Development GmbH # Licensed under either of # # * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) @@ -8,11 +8,30 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. import + # Standard library + typetraits, algorithm, hashes, + # Status libraries + # stew/byteutils, + # Internal ./private/datatypes, - ./private/int_negabs, - ./private/compiletime_helpers, - ./intops, - typetraits, algorithm, hashes + # ./private/int_negabs, + # ./private/compiletime_helpers, + # ./intops, + ./uintops, ./endians2 + +from stew/byteutils import toHex # Why are we exporting readHexChar in byteutils? + +template leastSignificantWord*(a: SomeBigInteger): Word = + a.limbs[0] + +template mostSignificantWord*(a: SomeBigInteger): Word = + a.limbs[^1] + +template signedWordType*(_: type SomeBigInteger): type = + SignedWord + +template wordType*(_: type SomeBigInteger): type = + Word template static_check_size(T: typedesc[SomeInteger], bits: static[int]) = # To avoid a costly runtime check, we refuse storing into StUint types smaller @@ -24,178 +43,134 @@ template static_check_size(T: typedesc[SomeInteger], bits: static[int]) = "\nUse a smaller input type instead. This is a compile-time check" & " to avoid a costly run-time bit_length check at each StUint initialization." -func assignLo(result: var (UintImpl | IntImpl), n: SomeInteger) {.inline.} = - when result.lo is UintImpl: - assignLo(result.lo, n) - else: - result.lo = (type result.lo)(n) - func stuint*[T: SomeInteger](n: T, bits: static[int]): StUint[bits] {.inline.}= ## Converts an integer to an arbitrary precision integer. - - doAssert n >= 0.T - when result.data is UintImpl: - static_check_size(T, bits) - assignLo(result.data, n) + when sizeof(n) > sizeof(Word): + result.limbs[0] = Word(n and Word.high) + result.limbs[1] = Word(n shr WordBitWidth) else: - result.data = (type result.data)(n) - -func stint*[T: SomeInteger](n: T, bits: static[int]): StInt[bits] {.inline.}= - ## Converts an integer to an arbitrary precision signed integer. 
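The limb-based `stuint` just above zero-extends a native integer into the limbs array, and `truncate` (further down in this file) goes the other way by keeping only the least significant word. A small sketch of the intended behavior:

```nim
import stint

let a = 0xdeadbeef'u32.stuint(256)             # widen a native integer to 256 bits
doAssert a == 0xdeadbeef'u64.u256
doAssert a.truncate(uint32) == 0xdeadbeef'u32  # result is modulo 2^32
```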
- - when result.data is IntImpl: - static_check_size(T, bits) - when T is SomeSignedInt: - if n < 0: - # TODO: when bits >= 128, cannot create from - # low(int8-64) - # see: status-im/nim-stint/issues/92 - assignLo(result.data, -n) - result = -result - else: - assignLo(result.data, n) - else: - assignLo(result.data, n) - else: - result.data = (type result.data)(n) - -func to*(x: SomeInteger, T: typedesc[StInt]): T = - stint(x, result.bits) + result.limbs[0] = Word(n) -func to*(x: SomeUnsignedInt, T: typedesc[StUint]): T = - stuint(x, result.bits) +# func stint*[T: SomeInteger](n: T, bits: static[int]): StInt[bits] {.inline.}= +# ## Converts an integer to an arbitrary precision signed integer. +# +# when result.data is IntImpl: +# static_check_size(T, bits) +# when T is SomeSignedInt: +# if n < 0: +# # TODO: when bits >= 128, cannot create from +# # low(int8-64) +# # see: status-im/nim-stint/issues/92 +# assignLo(result.data, -n) +# result = -result +# else: +# assignLo(result.data, n) +# else: +# assignLo(result.data, n) +# else: +# result.data = (type result.data)(n) + +# func to*(a: SomeInteger, T: typedesc[Stint]): T = +# stint(a, result.bits) + +func to*(a: SomeUnsignedInt, T: typedesc[StUint]): T = + stuint(a, result.bits) func truncate*(num: StInt or StUint, T: typedesc[SomeInteger]): T {.inline.}= ## Extract the int, uint, int8-int64 or uint8-uint64 portion of a multi-precision integer. ## Note that int and uint are 32-bit on 32-bit platform. ## For unsigned result type, result is modulo 2^(sizeof T in bit) ## For signed result type, result is undefined if input does not fit in the target type. - static: - doAssert bitsof(T) <= bitsof(num.data.leastSignificantWord) - - when nimvm: - let data = num.data.leastSignificantWord - vmIntCast[T](data) - else: - cast[T](num.data.leastSignificantWord) + result = T(num.leastSignificantWord()) func toInt*(num: StInt or StUint): int {.inline, deprecated:"Use num.truncate(int) instead".}= num.truncate(int) -func bigToSmall(result: var (UintImpl | IntImpl), x: auto) {.inline.} = - when bitsof(x) == bitsof(result): - when type(result) is type(x): - result = x - else: - result = convert[type(result)](x) - else: - bigToSmall(result, x.lo) - -func smallToBig(result: var (UintImpl | IntImpl), x: auto) {.inline.} = - when bitsof(x) == bitsof(result): - when type(result) is type(x): - result = x - else: - result = convert[type(result)](x) - else: - smallToBig(result.lo, x) - -func stuint*(x: StUint, bits: static[int]): StUint[bits] {.inline.} = +func stuint*(a: StUint, bits: static[int]): StUint[bits] {.inline.} = ## unsigned int to unsigned int conversion ## smaller to bigger bits conversion will have the same value ## bigger to smaller bits conversion, the result is truncated - const N = bitsof(x.data) - when N < bits: - when N <= 64: - result = stuint(x.data, bits) - else: - smallToBig(result.data, x.data) - elif N > bits: - when bits <= 64: - result = stuint(x.truncate(type(result.data)), bits) - else: - bigToSmall(result.data, x.data) - else: - result = x - -func stuint*(x: StInt, bits: static[int]): StUint[bits] {.inline.} = - ## signed int to unsigned int conversion - ## current behavior is cast-like, copying bit pattern - ## or truncating if input does not fit into destination - const N = bitsof(x.data) - when N < bits: - when N <= 64: - type T = StUint[N] - result = stuint(convert[T](x).data, bits) - else: - smallToBig(result.data, x.data) - elif N > bits: - when bits <= 64: - result = stuint(x.truncate(type(result.data)), bits) - else: - 
bigToSmall(result.data, x.data) - else: - result = convert[type(result)](x) - -func stint*(x: StInt, bits: static[int]): StInt[bits] {.inline.} = - ## signed int to signed int conversion - ## will raise exception if input does not fit into destination - const N = bitsof(x.data) - when N < bits: - when N <= 64: - result = stint(x.data, bits) - else: - if x.isNegative: - smallToBig(result.data, (-x).data) - result = -result - else: - smallToBig(result.data, x.data) - elif N > bits: - template checkNegativeRange() = - # due to bug #92, we skip negative range check - when false: - const dmin = stint((type result).low, N) - if x < dmin: raise newException(ValueError, "value out of range") - - template checkPositiveRange() = - const dmax = stint((type result).high, N) - if x > dmax: raise newException(ValueError, "value out of range") - - when bits <= 64: - if x.isNegative: - checkNegativeRange() - result = stint((-x).truncate(type(result.data)), bits) - result = -result - else: - checkPositiveRange() - result = stint(x.truncate(type(result.data)), bits) - else: - if x.isNegative: - checkNegativeRange() - bigToSmall(result.data, (-x).data) - result = -result - else: - checkPositiveRange() - bigToSmall(result.data, x.data) - else: - result = x - -func stint*(x: StUint, bits: static[int]): StInt[bits] {.inline.} = - const N = bitsof(x.data) - const dmax = stuint((type result).high, N) - if x > dmax: raise newException(ValueError, "value out of range") - when N < bits: - when N <= 64: - result = stint(x.data, bits) - else: - smallToBig(result.data, x.data) - elif N > bits: - when bits <= 64: - result = stint(x.truncate(type(result.data)), bits) - else: - bigToSmall(result.data, x.data) - else: - result = convert[type(result)](x) + for i in 0 ..< result.len: + result[i] = a[i] + +# func StUint*(a: StInt, bits: static[int]): StUint[bits] {.inline.} = +# ## signed int to unsigned int conversion +# ## current behavior is cast-like, copying bit pattern +# ## or truncating if input does not fit into destination +# const N = bitsof(x.data) +# when N < bits: +# when N <= 64: +# type T = StUint[N] +# result = StUint(convert[T](a).data, bits) +# else: +# smallToBig(result.data, a.data) +# elif N > bits: +# when bits <= 64: +# result = StUint(x.truncate(type(result.data)), bits) +# else: +# bigToSmall(result.data, a.data) +# else: +# result = convert[type(result)](a) + +# func stint*(a: StInt, bits: static[int]): StInt[bits] {.inline.} = +# ## signed int to signed int conversion +# ## will raise exception if input does not fit into destination +# const N = bitsof(a.data) +# when N < bits: +# when N <= 64: +# result = stint(a.data, bits) +# else: +# if a.isNegative: +# smallToBig(result.data, (-a).data) +# result = -result +# else: +# smallToBig(result.data, a.data) +# elif N > bits: +# template checkNegativeRange() = +# # due to bug #92, we skip negative range check +# when false: +# const dmin = stint((type result).low, N) +# if a < dmin: raise newException(RangeError, "value out of range") + +# template checkPositiveRange() = +# const dmax = stint((type result).high, N) +# if a > dmax: raise newException(RangeError, "value out of range") + +# when bits <= 64: +# if a.isNegative: +# checkNegativeRange() +# result = stint((-a).truncate(type(result.data)), bits) +# result = -result +# else: +# checkPositiveRange() +# result = stint(a.truncate(type(result.data)), bits) +# else: +# if a.isNegative: +# checkNegativeRange() +# bigToSmall(result.data, (-a).data) +# result = -result +# else: +# checkPositiveRange() 
+# bigToSmall(result.data, a.data) +# else: +# result = a + +# func stint*(a: StUint, bits: static[int]): StInt[bits] {.inline.} = +# const N = bitsof(a.data) +# const dmax = StUint((type result).high, N) +# if a > dmax: raise newException(RangeError, "value out of range") +# when N < bits: +# when N <= 64: +# result = stint(a.data, bits) +# else: +# smallToBig(result.data, a.data) +# elif N > bits: +# when bits <= 64: +# result = stint(a.truncate(type(result.data)), bits) +# else: +# bigToSmall(result.data, a.data) +# else: +# result = convert[type(result)](a) func readHexChar(c: char): int8 {.inline.}= ## Converts an hex char to an int @@ -209,8 +184,6 @@ func readHexChar(c: char): int8 {.inline.}= func skipPrefixes(current_idx: var int, str: string, radix: range[2..16]) {.inline.} = ## Returns the index of the first meaningful char in `hexStr` by skipping ## "0x" prefix - # Always called from a context where radix is known at compile-time - # and checked within 2..16 and so cannot throw a RangeDefect at runtime if str.len < 2: return @@ -218,20 +191,14 @@ func skipPrefixes(current_idx: var int, str: string, radix: range[2..16]) {.inli doAssert current_idx == 0, "skipPrefixes only works for prefixes (position 0 and 1 of the string)" if str[0] == '0': if str[1] in {'x', 'X'}: - if radix == 16: - current_idx = 2 - else: - raise newException(ValueError,"Parsing mismatch, 0x prefix is only valid for a hexadecimal number (base 16)") + doAssert radix == 16, "Parsing mismatch, 0x prefix is only valid for a hexadecimal number (base 16)" + current_idx = 2 elif str[1] in {'o', 'O'}: - if radix == 8: - current_idx = 2 - else: - raise newException(ValueError, "Parsing mismatch, 0o prefix is only valid for an octal number (base 8)") + doAssert radix == 8, "Parsing mismatch, 0o prefix is only valid for an octal number (base 8)" + current_idx = 2 elif str[1] in {'b', 'B'}: - if radix == 2: - current_idx = 2 - elif radix != 16: - raise newException(ValueError, "Parsing mismatch, 0b prefix is only valid for a binary number (base 2) or as first byte of a hexadecimal number (base 16)") + doAssert radix == 2, "Parsing mismatch, 0b prefix is only valid for a binary number (base 2)" + current_idx = 2 func nextNonBlank(current_idx: var int, s: string) {.inline.} = ## Move the current index, skipping white spaces and "_" characters. @@ -242,15 +209,13 @@ func nextNonBlank(current_idx: var int, s: string) {.inline.} = while current_idx < s.len and s[current_idx] in blanks: inc current_idx -func readDecChar(c: char): int {.inline.}= +func readDecChar(c: range['0'..'9']): int {.inline.}= ## Converts a decimal char to an int # specialization without branching for base <= 10. - if c notin {'0'..'9'}: - raise newException(ValueError, "Character out of '0'..'9' range") ord(c) - ord('0') func parse*[bits: static[int]](input: string, T: typedesc[StUint[bits]], radix: static[uint8] = 10): T = - ## Parse a string and store the result in a StInt[bits] or StUint[bits]. + ## Parse a string and store the result in a Stint[bits] or StUint[bits]. static: doAssert (radix >= 2) and radix <= 16, "Only base from 2..16 are supported" # TODO: use static[range[2 .. 
16]], not supported at the moment (2018-04-26) @@ -270,43 +235,43 @@ func parse*[bits: static[int]](input: string, T: typedesc[StUint[bits]], radix: result = result * base + input[curr].readHexChar.stuint(bits) nextNonBlank(curr, input) -func parse*[bits: static[int]](input: string, T: typedesc[StInt[bits]], radix: static[int8] = 10): T = - ## Parse a string and store the result in a StInt[bits] or StUint[bits]. +# func parse*[bits: static[int]](input: string, T: typedesc[Stint[bits]], radix: static[int8] = 10): T = +# ## Parse a string and store the result in a Stint[bits] or StUint[bits]. - static: doAssert (radix >= 2) and radix <= 16, "Only base from 2..16 are supported" - # TODO: use static[range[2 .. 16]], not supported at the moment (2018-04-26) +# static: doAssert (radix >= 2) and radix <= 16, "Only base from 2..16 are supported" +# # TODO: use static[range[2 .. 16]], not supported at the moment (2018-04-26) - # TODO: we can special case hex result/input as an array of bytes - # and be much faster +# # TODO: we can special case hex result/input as an array of bytes +# # and be much faster - # For conversion we require overflowing operations (for example for negative hex numbers) - const base = radix.int8.stuint(bits) +# # For conversion we require overflowing operations (for example for negative hex numbers) +# const base = radix.int8.StUint(bits) - var - curr = 0 # Current index in the string - isNeg = false - no_overflow: StUint[bits] +# var +# curr = 0 # Current index in the string +# isNeg = false +# no_overflow: StUint[bits] - if input[curr] == '-': - doAssert radix == 10, "Negative numbers are only supported with base 10 input." - isNeg = true - inc curr - else: - skipPrefixes(curr, input, radix) +# if input[curr] == '-': +# doAssert radix == 10, "Negative numbers are only supported with base 10 input." +# isNeg = true +# inc curr +# else: +# skipPrefixes(curr, input, radix) - while curr < input.len: - # TODO: overflow detection - when radix <= 10: - no_overflow = no_overflow * base + input[curr].readDecChar.stuint(bits) - else: - no_overflow = no_overflow * base + input[curr].readHexChar.stuint(bits) - nextNonBlank(curr, input) +# while curr < input.len: +# # TODO: overflow detection +# when radix <= 10: +# no_overflow = no_overflow * base + input[curr].readDecChar.StUint(bits) +# else: +# no_overflow = no_overflow * base + input[curr].readHexChar.StUint(bits) +# nextNonBlank(curr, input) - # TODO: we can't create the lowest int this way - if isNeg: - result = -convert[T](no_overflow) - else: - result = convert[T](no_overflow) +# # TODO: we can't create the lowest int this way +# if isNeg: +# result = -convert[T](no_overflow) +# else: +# result = convert[T](no_overflow) func fromHex*(T: typedesc[StUint|StInt], s: string): T {.inline.} = ## Convert an hex string to the corresponding unsigned integer @@ -317,7 +282,7 @@ func hexToUint*[bits: static[int]](hexString: string): StUint[bits] {.inline.} = parse(hexString, type result, radix = 16) func toString*[bits: static[int]](num: StUint[bits], radix: static[uint8] = 10): string = - ## Convert a StInt or StUint to string. + ## Convert a Stint or StUint to string. ## In case of negative numbers: ## - they are prefixed with "-" for base 10. ## - if not base 10, they are returned raw in two-complement form. 
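Before the next hunk, a round-trip sketch of the parsing and printing API in this file (`parse`, `fromHex`, `toString`, `toHex`), assuming it is exported through the top-level `stint` module:

```nim
import stint

let a = parse("0xff", UInt256, 16)  # "0x" prefix is only accepted when radix == 16
doAssert a == 255.u256
doAssert a.toString() == "255"      # radix defaults to 10
doAssert a.toHex() == "ff"          # leading zeros are stripped
doAssert UInt256.fromHex("0xff") == a
```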
@@ -332,8 +297,8 @@ func toString*[bits: static[int]](num: StUint[bits], radix: static[uint8] = 10): var (q, r) = divmod(num, base) while true: - when bitsof(r.data) <= 64: - result.add hexChars[r.data.int] + when bits <= 64: + result.add hexChars[r.leastSignificantWord()] else: result.add hexChars[r.truncate(int)] if q.isZero: @@ -342,46 +307,46 @@ func toString*[bits: static[int]](num: StUint[bits], radix: static[uint8] = 10): reverse(result) -func toString*[bits: static[int]](num: StInt[bits], radix: static[int8] = 10): string = - ## Convert a StInt or StUint to string. - ## In case of negative numbers: - ## - they are prefixed with "-" for base 10. - ## - if not base 10, they are returned raw in two-complement form. +# func toString*[bits: static[int]](num: Stint[bits], radix: static[int8] = 10): string = +# ## Convert a Stint or StUint to string. +# ## In case of negative numbers: +# ## - they are prefixed with "-" for base 10. +# ## - if not base 10, they are returned raw in two-complement form. - static: doAssert (radix >= 2) and radix <= 16, "Only base from 2..16 are supported" - # TODO: use static[range[2 .. 16]], not supported at the moment (2018-04-26) +# static: doAssert (radix >= 2) and radix <= 16, "Only base from 2..16 are supported" +# # TODO: use static[range[2 .. 16]], not supported at the moment (2018-04-26) - const hexChars = "0123456789abcdef" - const base = radix.int8.stuint(bits) +# const hexChars = "0123456789abcdef" +# const base = radix.int8.StUint(bits) - result = "" +# result = "" - type T = StUint[bits] - let isNeg = num.isNegative - let num = convert[T](if radix == 10 and isNeg: -num - else: num) +# type T = StUint[bits] +# let isNeg = num.isNegative +# let num = convert[T](if radix == 10 and isNeg: -num +# else: num) - var (q, r) = divmod(num, base) +# var (q, r) = divmod(num, base) - while true: - when bitsof(r.data) <= 64: - result.add hexChars[r.data.int] - else: - result.add hexChars[r.truncate(int)] - if q.isZero: - break - (q, r) = divmod(q, base) +# while true: +# when bitsof(r.data) <= 64: +# result.add hexChars[r.data.int] +# else: +# result.add hexChars[r.truncate(int)] +# if q.isZero: +# break +# (q, r) = divmod(q, base) - if isNeg and radix == 10: - result.add '-' +# if isNeg and radix == 10: +# result.add '-' - reverse(result) +# reverse(result) -func `$`*(num: StInt or StUint): string {.inline.}= - when num.data is SomeInteger: - $num.data - else: - toString(num, 10) +# func `$`*(num: Stint or StUint): string {.inline.}= +# when num.data is SomeInteger: +# $num.data +# else: +# toString(num, 10) func toHex*[bits: static[int]](num: StInt[bits] or StUint[bits]): string {.inline.}= ## Convert to a hex string. @@ -389,7 +354,7 @@ func toHex*[bits: static[int]](num: StInt[bits] or StUint[bits]): string {.inlin ## Leading zeros are stripped. Use dumpHex instead if you need the in-memory representation toString(num, 16) -func dumpHex*(x: StInt or StUint, order: static[Endianness] = bigEndian): string = +func dumpHex*(a: StInt or StUint, order: static[Endianness] = bigEndian): string = ## Stringify an int to hex. ## Note. Leading zeros are not removed. Use toString(n, base = 16)/toHex instead. 
## @@ -401,127 +366,26 @@ func dumpHex*(x: StInt or StUint, order: static[Endianness] = bigEndian): string ## in littleEndian: ## - 1.uint64 will be 01000000 ## - (2.uint128)^64 + 1 will be 0100000001000000 + let bytes = a.toBytes(order) + result = bytes.toHex() - const - hexChars = "0123456789abcdef" - size = bitsof(x.data) div 8 - - result = newString(2*size) +export fromBytes, toBytes - when nimvm: - for i in 0 ..< size: - when order == system.cpuEndian: - let byte = x.data.getByte(i) - else: - let byte = x.data.getByte(size - 1 - i) - result[2*i] = hexChars[int byte shr 4 and 0xF] - result[2*i+1] = hexChars[int byte and 0xF] - else: - {.pragma: restrict, codegenDecl: "$# __restrict $#".} - let bytes {.restrict.}= cast[ptr array[size, byte]](x.unsafeAddr) - - for i in 0 ..< size: - when order == system.cpuEndian: - result[2*i] = hexChars[int bytes[i] shr 4 and 0xF] - result[2*i+1] = hexChars[int bytes[i] and 0xF] - else: - result[2*i] = hexChars[int bytes[bytes[].high - i] shr 4 and 0xF] - result[2*i+1] = hexChars[int bytes[bytes[].high - i] and 0xF] - -proc initFromBytesBE*[bits: static[int]](val: var StUint[bits], ba: openArray[byte], allowPadding: static[bool] = true) = - ## Initializes a UInt[bits] value from a byte buffer storing a big-endian - ## representation of a number. - ## - ## If `allowPadding` is set to false, the input array must be exactly - ## (bits div 8) bytes long. Otherwise, it may be shorter and the remaining - ## bytes will be assumed to be zero. - - const N = bits div 8 - - when not allowPadding: - doAssert(ba.len == N) - else: - doAssert ba.len <= N - when system.cpuEndian == bigEndian: - let baseIdx = N - val.len - else: - let baseIdx = ba.len - 1 - - when nimvm: - when system.cpuEndian == bigEndian: - when allowPadding: - for i, b in ba: val.data.setByte(baseIdx + i, b) - else: - for i, b in ba: val.data.setByte(i, b) - else: - when allowPadding: - for i, b in ba: val.data.setByte(baseIdx - i, b) - else: - for i, b in ba: val.data.setByte(N-1 - i, b) - else: - {.pragma: restrict, codegenDecl: "$# __restrict $#".} - let r_ptr {.restrict.} = cast[ptr array[N, byte]](val.addr) - - when system.cpuEndian == bigEndian: - # TODO: due to https://github.com/status-im/nim-stint/issues/38 - # We can't cast a stack byte array to stuint with a convenient proc signature. - when allowPadding: - for i, b in ba: r_ptr[baseIdx + i] = b - else: - for i, b in ba: r_ptr[i] = b - else: - when allowPadding: - for i, b in ba: r_ptr[baseIdx - i] = b - else: - for i, b in ba: r_ptr[N-1 - i] = b - -func significantBytesBE*(val: openArray[byte]): int {.deprecated.}= - ## Returns the number of significant trailing bytes in a big endian - ## representation of a number. - # TODO: move that in https://github.com/status-im/nim-byteutils - for i in 0 ..< val.len: - if val[i] != 0: - return val.len - i - return 1 - -func fromBytesBE*(T: type StUint, ba: openArray[byte], - allowPadding: static[bool] = true): T = - ## This function provides a convenience wrapper around `initFromBytesBE`. 
- result.initFromBytesBE(ba, allowPadding) - -func readUintBE*[bits: static[int]](ba: openArray[byte]): StUint[bits] = +func readUintBE*[bits: static[int]](ba: openArray[byte]): StUint[bits] {.noinit, inline.}= ## Convert a big-endian array of (bits div 8) Bytes to an UInt[bits] (in native host endianness) ## Input: ## - a big-endian openArray of size (bits div 8) at least ## Returns: ## - A unsigned integer of the same size with `bits` bits - ## - ## ⚠ If the openArray length is bigger than bits div 8, part converted is undefined behaviour. - result.initFromBytesBE(ba, false) + result = (typeof result).fromBytesBE(ba) -func toByteArrayBE*[bits: static[int]](n: StUint[bits]): array[bits div 8, byte] = +func toByteArrayBE*[bits: static[int]](n: StUint[bits]): array[bits div 8, byte] {.noinit, inline.}= ## Convert a uint[bits] to to a big-endian array of bits div 8 bytes ## Input: ## - an unsigned integer ## Returns: ## - a big-endian array of the same size - - const N = bits div 8 - - when nimvm: - for i in 0 ..< N: - when system.cpuEndian == bigEndian: - result[i] = n.data.getByte(i) - else: - result[i] = n.data.getByte(N - 1 - i) - else: - when system.cpuEndian == bigEndian: - result = cast[type result](n) - else: - {.pragma: restrict, codegenDecl: "$# __restrict $#".} - let n_ptr {.restrict.} = cast[ptr array[N, byte]](n.unsafeAddr) - for i in 0 ..< N: - result[N-1 - i] = n_ptr[i] + result = n.toBytesBE() template hash*(num: StUint|StInt): Hash = # TODO: @@ -529,3 +393,10 @@ template hash*(num: StUint|StInt): Hash = # Explore better hashing solutions in nim-stew. hashData(unsafeAddr num, sizeof num) +func fromBytesBE*(T: type StUint, ba: openArray[byte], allowPadding: static[bool] = true): T {.noinit, inline.}= + result = readUintBE[T.bits](ba) + when allowPadding: + result = result shl ((sizeof(T) - ba.len) * 8) + +template initFromBytesBE*(x: var StUint, ba: openArray[byte], allowPadding: static[bool] = true) = + x = fromBytesBE(type x, ba, allowPadding) diff --git a/stint/lenient_stint.nim b/stint/lenient_stint.nim index d7faa8d..c463ada 100644 --- a/stint/lenient_stint.nim +++ b/stint/lenient_stint.nim @@ -11,6 +11,8 @@ import ./int_public, ./uint_public, macros +# TODO: deprecate + type Signedness = enum BothSigned, IntOnly, UintOnly diff --git a/stint/literals_stint.nim b/stint/literals_stint.nim index e09e1f8..40a8431 100644 --- a/stint/literals_stint.nim +++ b/stint/literals_stint.nim @@ -9,7 +9,7 @@ ## This file provides syntactic sugar to work with literals -import ./intops, macros +import ./intops, ./uintops, macros type Signedness = enum BothSigned, IntOnly, UintOnly diff --git a/stint/modular_arithmetic.nim b/stint/modular_arithmetic.nim index 4eff437..d7f2914 100644 --- a/stint/modular_arithmetic.nim +++ b/stint/modular_arithmetic.nim @@ -7,7 +7,7 @@ # # at your option. This file may not be copied, modified, or distributed except according to those terms. 
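The `modular_arithmetic` hunk below only swaps the import from `intops` to `uintops`, but the module's purpose is easiest to see with an example. A sketch assuming its usual `addmod`/`mulmod` exports (with `powmod` available for exponentiation):

```nim
import stint

let m = 1000000007.u256
let a = 123456789.u256
let b = 987654321.u256

# Same results as the naive formulas, but computed without
# needing an oversized intermediate product.
doAssert addmod(a, b, m) == (a + b) mod m
doAssert mulmod(a, b, m) == (a * b) mod m
```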
-import ./intops, private/datatypes +import ./uintops, private/datatypes func addmod_internal(a, b, m: StUint): StUint {.inline.}= ## Modular addition diff --git a/stint/private/bitops2_priv.nim b/stint/private/bitops2_priv.nim deleted file mode 100644 index 0c2479f..0000000 --- a/stint/private/bitops2_priv.nim +++ /dev/null @@ -1,58 +0,0 @@ -# Stint -# Copyright 2018 Status Research & Development GmbH -# Licensed under either of -# -# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) -# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) -# -# at your option. This file may not be copied, modified, or distributed except according to those terms. - -import ./datatypes, ./conversion, stew/bitops2 -export bitops2 - -# Bitops from support library - -template bitsof*(x: UintImpl): int = - # XXX: https://github.com/nim-lang/Nim/issues/9494 - mixin bitsof - bitsof(x.lo) * 2 - -template bitsof*(x: IntImpl): int = - # XXX: https://github.com/nim-lang/Nim/issues/9494 - mixin bitsof - bitsof(x.lo) * 2 - -template bitsof*(x: typedesc[UintImpl]): int = - # XXX: https://github.com/nim-lang/Nim/issues/9494 - mixin bitsof - bitsof(x.lo) * 2 - -func countOnes*(x: UintImpl): int {.inline.} = - countOnes(x.lo) + countOnes(x.hi) - -func countZeros*(x: UintImpl): int {.inline.} = - countZeros(x.lo) + countOnes(x.hi) - -func parity*(x: UintImpl): int {.inline.} = - parity(x.lo) xor parity(x.hi) - -func leadingZeros*(x: UintImpl): int {.inline.} = - let tmp = x.hi.leadingZeros() - if tmp == bitsof(x.hi): - x.lo.leadingZeros() + bitsof(x.hi) - else: - tmp - -func trailingZeros*(x: UintImpl): int {.inline.} = - let tmp = x.lo.trailingZeros() - if tmp == bitsof(x.lo): - tmp + x.hi.trailingZeros() - else: - tmp - -func firstOne*(x: UintImpl): int {.inline.} = - let tmp = trailingZeros(x) - if tmp == bitsof(x): - 0 - else: - 1 + tmp diff --git a/stint/private/compiletime_helpers.nim b/stint/private/compiletime_helpers.nim deleted file mode 100644 index e49e361..0000000 --- a/stint/private/compiletime_helpers.nim +++ /dev/null @@ -1,82 +0,0 @@ -import - ./datatypes, ./uint_bitwise_ops, ./bitops2_priv, ./int_bitwise_ops, - ./compiletime_cast - -export compiletime_cast - -func getByte*(x: SomeInteger, pos: int): byte {.compileTime.} = - type DT = type x - when bitsof(DT) == 8: - cast[byte](x) - else: - byte((x shr (pos * 8)) and 0xFF.DT) - -func getByte*(x: UintImpl | IntImpl, pos: int): byte {.compileTime.} = - type DT = type x.leastSignificantWord - when bitsof(DT) == 8: - cast[byte](x.leastSignificantWord) - else: - byte((x shr (pos * 8)).leastSignificantWord and 0xFF.DT) - -proc setByte*(x: var SomeInteger, pos: int, b: byte) {.compileTime.} = - type DT = type x - x = x or (DT(b) shl (pos*8)) - -type SomeIntImpl = UintImpl | IntImpl -func setByte*(x: var SomeIntImpl, pos: int, b: byte) {.compileTime.} = - proc putFirstByte(x: var SomeInteger, b: byte) = - type DT = type x - x = x or b.DT - - proc putFirstByte(x: var UintImpl, b: byte) = - putFirstByte(x.lo, b) - - var cx: type x - cx.putFirstByte(b) - x = x or (cx shl (pos*8)) - -func copyToArray*(ret: var openArray[byte], x: UintImpl) {.compileTime.} = - const size = bitsof(x) div 8 - doAssert ret.len >= size - for i in 0 ..< size: - ret[i] = x.getByte(i) - -func copyFromArray*(x: var UintImpl, data: openArray[byte]) {.compileTime.} = - const size = bitsof(x) div 8 - doAssert data.len >= size - for i in 0 ..< size: - x.setByte(i, data[i]) - -func copyFromArray*(x: var 
SomeInteger, data: openArray[byte]) {.compileTime.} = - const size = bitsof(x) div 8 - doAssert data.len >= size - for i in 0 ..< size: - x.setByte(i, data[i]) - -template vmIntCast*[T](data: SomeInteger): T = - type DT = type data - const - bits = bitsof(T) - DTbits = bitsof(DT) - - # we use esoteric type juggling here to trick the Nim VM - when bits == 64: - when DTbits == 64: - cast[T](data) - else: - cast[T](uint64(data and DT(0xFFFFFFFF_FFFFFFFF))) - elif bits == 32: - when DTbits == 32: - cast[T](data) - else: - cast[T](uint32(data and DT(0xFFFFFFFF))) - elif bits == 16: - when DTbits == 16: - cast[T](data) - else: - cast[T](uint16(data and DT(0xFFFF))) - else: - when DTBits == 8: - cast[T](data) - else: - cast[T](uint8(data and DT(0xFF))) diff --git a/stint/private/conversion.nim b/stint/private/conversion.nim deleted file mode 100644 index 469e1b1..0000000 --- a/stint/private/conversion.nim +++ /dev/null @@ -1,59 +0,0 @@ -# Stint -# Copyright 2018 Status Research & Development GmbH -# Licensed under either of -# -# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) -# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) -# -# at your option. This file may not be copied, modified, or distributed except according to those terms. - -import ./datatypes - -func toSubtype*[T: SomeInteger](b: bool, _: typedesc[T]): T {.inline.}= - b.T - -func toSubtype*[T: UintImpl | IntImpl](b: bool, _: typedesc[T]): T {.inline.}= - type SubTy = type result.lo - result.lo = toSubtype(b, SubTy) - -func toUint*(n: UintImpl or IntImpl or SomeSignedInt): auto {.inline.}= - ## Casts an unsigned integer to an uint of the same size - # TODO: uint128 support - when n.sizeof > 8: - {.fatal: "Unreachable. You are trying to cast a StUint with more than 64-bit of precision" .} - elif n.sizeof == 8: - cast[uint64](n) - elif n.sizeof == 4: - cast[uint32](n) - elif n.sizeof == 2: - cast[uint16](n) - else: - cast[uint8](n) - -func toUint*(n: SomeUnsignedInt): SomeUnsignedInt {.inline.}= - ## No-op overload of multi-precision int casting - n - -func asDoubleUint*(n: UintImpl | SomeUnsignedInt): auto {.inline.} = - ## Convert an integer or StUint to an uint with double the size - type Double = ( - when n.sizeof == 4: uint64 - elif n.sizeof == 2: uint32 - else: uint16 - ) - - n.toUint.Double - -func toInt*(n: UintImpl or IntImpl or SomeInteger): auto {.inline.}= - ## Casts an unsigned integer to an uint of the same size - # TODO: uint128 support - when n.sizeof > 8: - {.fatal: "Unreachable. You are trying to cast a StUint with more than 64-bit of precision" .} - elif n.sizeof == 8: - cast[int64](n) - elif n.sizeof == 4: - cast[int32](n) - elif n.sizeof == 2: - cast[int16](n) - else: - cast[int8](n) diff --git a/stint/private/datatypes.nim b/stint/private/datatypes.nim index 867683b..569245c 100644 --- a/stint/private/datatypes.nim +++ b/stint/private/datatypes.nim @@ -7,191 +7,143 @@ # # at your option. This file may not be copied, modified, or distributed except according to those terms. -# TODO: test if GCC/Clang support uint128 natively - -# #### Overview -# -# Stint extends the default uint8, uint16, uint32, uint64 with power of 2 integers. -# Only limitation is your stack size so you can have uint128, uint256, uint512 ... -# Signed int are also possible. -# -# As a high-level API, Stint adheres to Nim and C conventions and uses the same operators like: -# `+`, `xor`, `not` ... 
-# -# #### Implementation -# -# Stint types are stored on the stack and have a structure -# similar to a binary tree of power of two unsigned integers -# with "high" and "low" words: -# -# Stuint[256] -# hi: Stuint[128] lo: Stuint[128] -# hihi: uint64 hilo: uint64 lohi: uint64 lolo: uint64 -# -# This follows paper https://hal.archives-ouvertes.fr/hal-00582593v2 -# "Recursive double-size fixed precision arithmetic" from Jul. 2016 -# to implement an efficient fixed precision bigint for embedded devices, especially FPGAs. -# -# For testing purpose, the flag `-d:stint_test` can be passed at compile-time -# to switch the backend to uint32. -# In the future the default backend will become uint128 on supporting compilers. -# -# This has following benefits: -# - BigEndian/LittleEndian support is trivial. -# - Not having for loops help the compiler producing the most efficient instructions -# like ADC (Add with Carry) -# - Proving that the recursive structure works at depth 64 for uint32 backend means that -# it would work at depth 128 for uint64 backend. -# We can easily choose a uint16 or uint8 backend as well. -# - Due to the recursive structure, testing operations when there is: -# - no leaves(uint64) -# - a root and leaves with no nodes (uint128) -# - a root + intermediate nodes + leaves (uint256) -# should be enough to ensure they work at all sizes, edge cases included. -# - Adding a new backend like uint128 (GCC/Clang) or uint256 (LLVM instrinsics only) is just adding -# a new case in the `uintImpl` template. -# - All math implementations of the operations have a straightforward translation -# to a high-low structure, including the fastest Karatsuba multiplication -# and co-recursive division algorithm by Burnikel and Ziegler. -# This makes translating those algorithms into Nim easier compared to an array backend. -# It would also probably require less code and would be much easier to audit versus -# the math reference papers. -# - For implementation of algorithms, there is no issue to take subslices of the memory representation -# with a recursive tree structure. -# On the other side, returning a `var array[N div 2, uint64]` is problematic at the moment. -# - Compile-time computation is possible while due to the previous issue -# an array backend would be required to use var openArray[uint64] -# i.e. pointers. -# - Note that while shift-right and left can easily be done an array of bytes -# this would have reduced performance compared to moving 64-bit words. -# An efficient implementation on array of words would require checking the shift -# versus a half-word to deal with carry-in/out from and to the adjacent words -# similar to a recursive implementation. -# -# Iterations over the whole integers, for example for `==` is always unrolled. -# Due to being on the stack, any optimizing compiler should compile that to efficient memcmp -# -# When not to use Stint: -# -# 1. Constant-time arithmetics -# - Do not use Stint if you need side-channels resistance, -# This requires to avoid all branches (i.e. no booleans) -# 2. Arbitrary-precision with lots of small-values -# - If you need arbitrary precision but most of the time you store mall values -# you will waste a lot of memory unless you use an object variant of various Stint sizes. -# type MyUint = object -# case kind: int -# of 0..64: uint64 -# of 66..128: ref Stuint[128] -# of 129..256: ref Stuint[256] -# ... -# -# Note: if you work with huge size, you can allocate stints on the heap with -# for example `type HeapInt8192 = ref Stint[8192]. 
-# If you have a lot of computations and intermediate variables it's probably worthwhile -# to explore creating an object pool to reuse the memory buffers. - -template checkDiv2(bits: static[int]): untyped = - # TODO: There is no need to check if power of 2 at each uintImpl instantiation, it slows compilation. - # However we easily get into nested templates instantiation if we use another - # template that first checks power-of-two and then calls the recursive uintImpl - static: - doAssert (bits and (bits-1)) == 0, $bits & " is not a power of 2" - doAssert bits >= 8, "The number of bits in a should be greater or equal to 8" - bits div 2 - -when defined(mpint_test): # TODO stint_test - template uintImpl*(bits: static[int]): untyped = - # Test version, StUint[64] = 2 uint32. Test the logic of the library - - when bits >= 128: UintImpl[uintImpl(checkDiv2(bits))] - elif bits == 64: UintImpl[uint32] - elif bits == 32: UintImpl[uint16] - elif bits == 16: UintImpl[uint8] - else: {.fatal: "Only power-of-2 >=16 supported when testing" .} - - template intImpl*(bits: static[int]): untyped = - # Test version, StInt[64] = 2 uint32. Test the logic of the library - # int is implemented using a signed hi part and an unsigned lo part, given - # that the sign resides in hi - - when bits >= 128: IntImpl[intImpl(checkDiv2(bits)), uintImpl(checkDiv2(bits))] - elif bits == 64: IntImpl[int32, uint32] - elif bits == 32: IntImpl[int16, uint16] - elif bits == 16: IntImpl[int8, uint8] - else: {.fatal: "Only power-of-2 >=16 supported when testing" .} - +import + # Status lib + stew/bitops2 + +when sizeof(int) == 8 and not defined(Stint32): + type + Word* = uint64 + SignedWord* = int64 else: - template uintImpl*(bits: static[int]): untyped = - mixin UintImpl - when bits >= 128: UintImpl[uintImpl(checkDiv2(bits))] - elif bits == 64: uint64 - elif bits == 32: uint32 - elif bits == 16: uint16 - elif bits == 8: uint8 - else: {.fatal: "Only power-of-2 >=8 supported" .} - - template intImpl*(bits: static[int]): untyped = - # int is implemented using a signed hi part and an unsigned lo part, given - # that the sign resides in hi - - when bits >= 128: IntImpl[intImpl(checkDiv2(bits)), uintImpl(checkDiv2(bits))] - elif bits == 64: int64 - elif bits == 32: int32 - elif bits == 16: int16 - elif bits == 8: int8 - else: {.fatal: "Only power-of-2 >=8 supported" .} + type + Word* = uint32 + SignedWord* = int32 -type - # ### Private ### # - UintImpl*[BaseUint] = object - when system.cpuEndian == littleEndian: - lo*, hi*: BaseUint - else: - hi*, lo*: BaseUint +const WordBitWidth* = sizeof(Word) * 8 - IntImpl*[BaseInt, BaseUint] = object - # Ints are implemented in terms of uints - when system.cpuEndian == littleEndian: - lo*: BaseUint - hi*: BaseInt - else: - hi*: BaseInt - lo*: BaseUint +func wordsRequired*(bits: int): int {.compileTime.} = + ## Compute the number of limbs required + ## for the **announced** bit length + (bits + WordBitWidth - 1) div WordBitWidth - # ### Private ### # +type + Limbs*[N: static int] = array[N, Word] + ## Limbs type StUint*[bits: static[int]] = object - data*: uintImpl(bits) + ## Stack-based integer + ## Unsigned + limbs*: array[bits.wordsRequired, Word] + # Limbs-Endianess is little-endian StInt*[bits: static[int]] = object - data*: intImpl(bits) - -template applyHiLo*(a: UintImpl | IntImpl, c: untyped): untyped = - ## Apply `c` to each of `hi` and `lo` - var res: type a - res.hi = c(a.hi) - res.lo = c(a.lo) - res - -template applyHiLo*(a, b: UintImpl | IntImpl, c: untyped): untyped = - ## Apply `c` to 
each of `hi` and `lo` - var res: type a - res.hi = c(a.hi, b.hi) - res.lo = c(a.lo, b.lo) - res - -template leastSignificantWord*(num: SomeInteger): auto = - num - -func leastSignificantWord*(num: UintImpl | IntImpl): auto {.inline.} = - when num.lo is UintImpl: - num.lo.leastSignificantWord - else: - num.lo - -func mostSignificantWord*(num: UintImpl | IntImpl): auto {.inline.} = - when num.hi is (UintImpl | IntImpl): - num.hi.mostSignificantWord - else: - num.hi + ## Stack-based integer + ## Signed + limbs*: array[bits.wordsRequired, Word] + + # {.borrow: `.`.} only works with nim-devel + # StInt*[bits: static[int]] {.borrow: `.`.} = distinct StUint[bits] + ## Stack-based integer + ## Signed + + Carry* = uint8 # distinct range[0'u8 .. 1] + Borrow* = uint8 # distinct range[0'u8 .. 1] + + SomeBigInteger*[bits: static[int]] = StUint[bits] | StInt[bits] + +const GCC_Compatible* = defined(gcc) or defined(clang) or defined(llvm_gcc) +const X86* = defined(amd64) or defined(i386) + +when sizeof(int) == 8 and GCC_Compatible: + type + uint128*{.importc: "unsigned __int128".} = object + +# Bithacks +# -------------------------------------------------------- + +{.push raises: [], inline, noinit, gcsafe.} + +template clearExtraBitsOverMSB*(a: var StUint) = + ## A Stuint is stored in an array of 32 of 64-bit word + ## If we do bit manipulation at the word level, + ## for example a 8-bit stuint stored in a 64-bit word + ## we need to clear the upper 56-bit + when a.bits != a.limbs.len * WordBitWidth: + const posExtraBits = a.bits - (a.limbs.len-1) * WordBitWidth + const mask = (Word(1) shl posExtraBits) - 1 + a[^1] = a[^1] and mask + +func usedBitsAndWords*(a: openArray[Word]): tuple[bits, words: int] = + ## Returns the number of used words and bits in a bigInt + ## Returns (0, 0) for all-zeros array (even if technically you need 1 bit and 1 word to encode zero) + var clz = 0 + # Count Leading Zeros + for i in countdown(a.len-1, 0): + let count = log2trunc(a[i]) + # debugEcho "count: ", count, ", a[", i, "]: ", a[i].toBin(64) + if count == -1: + clz += WordBitWidth + else: + clz += WordBitWidth - count - 1 + return (a.len*WordBitWidth - clz, i+1) + return (0, 0) + +{.pop.} + +# Accessors +# -------------------------------------------------------- + +template `[]`*(a: SomeBigInteger, i: SomeInteger or BackwardsIndex): Word = + a.limbs[i] + +template `[]=`*(a: var SomeBigInteger, i: SomeInteger or BackwardsIndex, val: Word) = + a.limbs[i] = val + +# Iterations +# -------------------------------------------------------- + +import std/macros + +proc replaceNodes(ast: NimNode, what: NimNode, by: NimNode): NimNode = + # Replace "what" ident node by "by" + proc inspect(node: NimNode): NimNode = + case node.kind: + of {nnkIdent, nnkSym}: + if node.eqIdent(what): + return by + return node + of nnkEmpty: + return node + of nnkLiterals: + return node + else: + var rTree = node.kind.newTree() + for child in node: + rTree.add inspect(child) + return rTree + result = inspect(ast) + +macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped): untyped = + ## staticFor [min inclusive, max exclusive) + result = newStmtList() + for i in start ..< stopEx: + result.add nnkBlockStmt.newTree( + ident("unrolledIter_" & $idx & $i), + body.replaceNodes(idx, newLit i) + ) + +# Copy +# -------------------------------------------------------- +{.push raises: [], inline, noinit, gcsafe.} + +func copyWords*( + a: var openArray[Word], startA: int, + b: openArray[Word], startB: int, + numWords: int) = + ## Copy a 
slice of B into A. This properly deals + ## with overlaps when A and B are slices of the same buffer + for i in countdown(numWords-1, 0): + a[startA+i] = b[startB+i] + +{.pop.} diff --git a/stint/private/endians2_priv.nim b/stint/private/endians2_priv.nim deleted file mode 100644 index fb02b1a..0000000 --- a/stint/private/endians2_priv.nim +++ /dev/null @@ -1,26 +0,0 @@ -import ./bitops2_priv, ./datatypes, ./compiletime_helpers -import stew/endians2 -export endians2 - -func swapBytes*(x: UintImpl): UintImpl {.inline.} = - let lo = swapBytes(x.hi) - let hi = swapBytes(x.lo) - - UintImpl(hi: hi, lo: lo) - -func toBytes*(x: UintImpl, endian: Endianness = system.cpuEndian): auto {.inline.} = - # TODO can't use bitsof in return type (compiler bug?), hence return auto - var ret: array[bitsof(x) div 8, byte] - when nimvm: - if endian == system.cpuEndian: - copyToArray(ret, x) - else: - let v = swapBytes(x) - copyToArray(ret, v) - else: - if endian == system.cpuEndian: - copyMem(addr ret[0], unsafeAddr x, ret.len) - else: - let v = swapBytes(x) - copyMem(addr ret[0], unsafeAddr v, ret.len) - ret diff --git a/stint/private/initialization.nim b/stint/private/initialization.nim deleted file mode 100644 index 1c4b108..0000000 --- a/stint/private/initialization.nim +++ /dev/null @@ -1,19 +0,0 @@ -# Stint -# Copyright 2018 Status Research & Development GmbH -# Licensed under either of -# -# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) -# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) -# -# at your option. This file may not be copied, modified, or distributed except according to those terms. - -import ./datatypes - -func zero*(T: typedesc): T {.inline.} = - discard - -func one*(T: typedesc[SomeInteger]): T {.inline.} = - 1 - -func one*(T: typedesc[UintImpl or IntImpl]): T {.inline.} = - result.lo = one(type result.lo) diff --git a/stint/private/primitives/addcarry_subborrow.nim b/stint/private/primitives/addcarry_subborrow.nim new file mode 100644 index 0000000..3751602 --- /dev/null +++ b/stint/private/primitives/addcarry_subborrow.nim @@ -0,0 +1,186 @@ +# Stint +# Copyright 2018 Status Research & Development GmbH +# Licensed under either of +# +# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +# +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import ../datatypes, ./compiletime_fallback + +# ############################################################ +# +# Add-with-carry and Sub-with-borrow +# +# ############################################################ +# +# This file implements add-with-carry and sub-with-borrow +# +# It is currently (Mar 2020) impossible to have the compiler +# generate optimal code in a generic way. +# +# On x86, addcarry_u64 intrinsic will generate optimal code +# except for GCC. +# +# On other CPU architectures inline assembly might be desirable. +# A compiler proof-of-concept is available in the "research" folder. 
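+#
+# As an illustrative sketch (not from the original sources), the `addC`
+# primitive defined below is meant to be chained limb by limb, e.g. a
+# 256-bit addition over 4 x 64-bit limbs:
+#
+# ```Nim
+# let a = [1'u64, 0, 0, 0]
+# let b = [high(uint64), 0, 0, 0]
+# var r: array[4, uint64]
+# var carry: Carry                        # zero-initialized by Nim
+# for i in 0 ..< 4:
+#   addC(carry, r[i], a[i], b[i], carry)  # r[i] <- a[i] + b[i] + carry
+# # r = [0, 1, 0, 0]: the carry out of limb 0 propagated into limb 1
+# ```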
+#
+# See https://gcc.godbolt.org/z/2h768y
+# ```C
+# #include <stdint.h>
+# #include <x86intrin.h>
+#
+# void add256(uint64_t a[4], uint64_t b[4]){
+#   uint8_t carry = 0;
+#   for (int i = 0; i < 4; ++i)
+#     carry = _addcarry_u64(carry, a[i], b[i], &a[i]);
+# }
+# ```
+#
+# GCC
+# ```asm
+# add256:
+#         movq    (%rsi), %rax
+#         addq    (%rdi), %rax
+#         setc    %dl
+#         movq    %rax, (%rdi)
+#         movq    8(%rdi), %rax
+#         addb    $-1, %dl
+#         adcq    8(%rsi), %rax
+#         setc    %dl
+#         movq    %rax, 8(%rdi)
+#         movq    16(%rdi), %rax
+#         addb    $-1, %dl
+#         adcq    16(%rsi), %rax
+#         setc    %dl
+#         movq    %rax, 16(%rdi)
+#         movq    24(%rsi), %rax
+#         addb    $-1, %dl
+#         adcq    %rax, 24(%rdi)
+#         ret
+# ```
+#
+# Clang
+# ```asm
+# add256:
+#         movq    (%rsi), %rax
+#         addq    %rax, (%rdi)
+#         movq    8(%rsi), %rax
+#         adcq    %rax, 8(%rdi)
+#         movq    16(%rsi), %rax
+#         adcq    %rax, 16(%rdi)
+#         movq    24(%rsi), %rax
+#         adcq    %rax, 24(%rdi)
+#         retq
+# ```
+
+# ############################################################
+#
+#                        Intrinsics
+#
+# ############################################################
+
+# Note: GCC before 2017 had incorrect codegen in some cases:
+# - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81300
+
+when X86:
+  when defined(windows):
+    {.pragma: intrinsics, header:"<intrin.h>", nodecl.}
+  else:
+    {.pragma: intrinsics, header:"<x86intrin.h>", nodecl.}
+
+  func addcarry_u32(carryIn: Carry, a, b: uint32, sum: var uint32): Carry {.importc: "_addcarry_u32", intrinsics.}
+  func subborrow_u32(borrowIn: Borrow, a, b: uint32, diff: var uint32): Borrow {.importc: "_subborrow_u32", intrinsics.}
+
+  func addcarry_u64(carryIn: Carry, a, b: uint64, sum: var uint64): Carry {.importc: "_addcarry_u64", intrinsics.}
+  func subborrow_u64(borrowIn: Borrow, a, b: uint64, diff: var uint64): Borrow {.importc: "_subborrow_u64", intrinsics.}
+
+# ############################################################
+#
+#                          Public
+#
+# ############################################################
+
+func addC*(cOut: var Carry, sum: var uint32, a, b: uint32, cIn: Carry) {.inline.} =
+  ## Addition with carry
+  ## (CarryOut, Sum) <- a + b + CarryIn
+  when nimvm:
+    let dblPrec = uint64(cIn) + uint64(a) + uint64(b)
+    sum = uint32(dblPrec and uint32.high)
+    cOut = Carry(dblPrec shr 32)
+  else:
+    when X86:
+      cOut = addcarry_u32(cIn, a, b, sum)
+    else:
+      let dblPrec = uint64(cIn) + uint64(a) + uint64(b)
+      sum = uint32(dblPrec)
+      cOut = Carry(dblPrec shr 32)
+
+func subB*(bOut: var Borrow, diff: var uint32, a, b: uint32, bIn: Borrow) {.inline.} =
+  ## Subtraction with borrow
+  ## (BorrowOut, Diff) <- a - b - borrowIn
+  when nimvm:
+    let dblPrec = uint64(a) - uint64(b) - uint64(bIn)
+    diff = uint32(dblPrec and uint32.high)
+    # On borrow the high word will be 0b1111...1111 and needs to be masked
+    bOut = Borrow((dblPrec shr 32) and 1)
+  else:
+    when X86:
+      bOut = subborrow_u32(bIn, a, b, diff)
+    else:
+      let dblPrec = uint64(a) - uint64(b) - uint64(bIn)
+      diff = uint32(dblPrec)
+      # On borrow the high word will be 0b1111...1111 and needs to be masked
+      bOut = Borrow((dblPrec shr 32) and 1)
+
+func addC*(cOut: var Carry, sum: var uint64, a, b: uint64, cIn: Carry) {.inline.} =
+  ## Addition with carry
+  ## (CarryOut, Sum) <- a + b + CarryIn
+  when nimvm:
+    addC_nim(cOut, sum, a, b, cIn)
+  else:
+    when X86:
+      cOut = addcarry_u64(cIn, a, b, sum)
+    else:
+      block:
+        static:
+          doAssert GCC_Compatible
+          doAssert sizeof(int) == 8
+
+        var dblPrec {.noInit.}: uint128
+        {.emit:[dblPrec, " = (unsigned __int128)", a," + (unsigned __int128)", b, " + (unsigned __int128)",cIn,";"].}
+
+        # Don't forget to dereference the var param in C mode
+        when defined(cpp):
+          {.emit:[cOut, " 
= (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:[sum, " = (NU64)", dblPrec,";"].} + else: + {.emit:["*",cOut, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:["*",sum, " = (NU64)", dblPrec,";"].} + +func subB*(bOut: var Borrow, diff: var uint64, a, b: uint64, bIn: Borrow) {.inline.} = + ## Substraction with borrow + ## (BorrowOut, Diff) <- a - b - borrowIn + when nimvm: + subB_nim(bOut, diff, a, b, bIn) + else: + when X86: + bOut = subborrow_u64(bIn, a, b, diff) + else: + block: + static: + doAssert GCC_Compatible + doAssert sizeof(int) == 8 + + var dblPrec {.noInit.}: uint128 + {.emit:[dblPrec, " = (unsigned __int128)", a," - (unsigned __int128)", b, " - (unsigned __int128)",bIn,";"].} + + # Don't forget to dereference the var param in C mode + # On borrow the high word will be 0b1111...1111 and needs to be masked + when defined(cpp): + {.emit:[bOut, " = (NU64)(", dblPrec," >> ", 64'u64, ") & 1;"].} + {.emit:[diff, " = (NU64)", dblPrec,";"].} + else: + {.emit:["*",bOut, " = (NU64)(", dblPrec," >> ", 64'u64, ") & 1;"].} + {.emit:["*",diff, " = (NU64)", dblPrec,";"].} diff --git a/stint/private/primitives/compiletime_fallback.nim b/stint/private/primitives/compiletime_fallback.nim new file mode 100644 index 0000000..daf65c7 --- /dev/null +++ b/stint/private/primitives/compiletime_fallback.nim @@ -0,0 +1,154 @@ +# Stint +# Copyright 2018 Status Research & Development GmbH +# Licensed under either of +# +# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +# +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import ../datatypes + +# ############################################################ +# +# VM fallback for uint64 +# +# ############################################################ + +const + uint64BitWidth = 64 + HalfWidth = uint64BitWidth shr 1 + HalfBase = 1'u64 shl HalfWidth + HalfMask = HalfBase - 1 + +func hi(n: uint64): uint64 = + result = n shr HalfWidth + +func lo(n: uint64): uint64 = + result = n and HalfMask + +func split(n: uint64): tuple[hi, lo: uint64] = + result.hi = n.hi + result.lo = n.lo + +func merge(hi, lo: uint64): uint64 = + (hi shl HalfWidth) or lo + +func addC_nim*(cOut: var Carry, sum: var uint64, a, b: uint64, cIn: Carry) = + # Add with carry, fallback for the Compile-Time VM + # (CarryOut, Sum) <- a + b + CarryIn + let (aHi, aLo) = split(a) + let (bHi, bLo) = split(b) + let tLo = aLo + bLo + cIn + let (cLo, rLo) = split(tLo) + let tHi = aHi + bHi + cLo + let (cHi, rHi) = split(tHi) + cOut = Carry(cHi) + sum = merge(rHi, rLo) + +func subB_nim*(bOut: var Borrow, diff: var uint64, a, b: uint64, bIn: Borrow) = + # Substract with borrow, fallback for the Compile-Time VM + # (BorrowOut, Sum) <- a - b - BorrowIn + let (aHi, aLo) = split(a) + let (bHi, bLo) = split(b) + let tLo = HalfBase + aLo - bLo - bIn + let (noBorrowLo, rLo) = split(tLo) + let tHi = HalfBase + aHi - bHi - uint64(noBorrowLo == 0) + let (noBorrowHi, rHi) = split(tHi) + bOut = Borrow(noBorrowHi == 0) + diff = merge(rHi, rLo) + +func mul_nim*(hi, lo: var uint64, u, v: uint64) = + ## Extended precision multiplication + ## (hi, lo) <- u * v + var x0, x1, x2, x3: uint64 + + let + (uh, ul) = u.split() + (vh, vl) = v.split() + + x0 = ul * vl + x1 = ul * vh + x2 = uh * vl + x3 = uh * vh + + x1 += hi(x0) # This can't carry + x1 += x2 # but this can + if x1 < x2: # if carry, add it to x3 + x3 += 
HalfBase + + hi = x3 + hi(x1) + lo = merge(x1, lo(x0)) + +func muladd1_nim*(hi, lo: var uint64, a, b, c: uint64) {.inline.} = + ## Extended precision multiplication + addition + ## (hi, lo) <- a*b + c + ## + ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) + ## so adding any c cannot overflow + var carry: Carry + mul_nim(hi, lo, a, b) + addC_nim(carry, lo, lo, c, 0) + addC_nim(carry, hi, hi, 0, carry) + +func muladd2_nim*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}= + ## Extended precision multiplication + addition + addition + ## (hi, lo) <- a*b + c1 + c2 + ## + ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) + ## so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000) + ## and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing + var carry1, carry2: Carry + + mul_nim(hi, lo, a, b) + # Carry chain 1 + addC_nim(carry1, lo, lo, c1, 0) + addC_nim(carry1, hi, hi, 0, carry1) + # Carry chain 2 + addC_nim(carry2, lo, lo, c2, 0) + addC_nim(carry2, hi, hi, 0, carry2) + + +func div2n1n_nim*[T: SomeUnsignedInt](q, r: var T, n_hi, n_lo, d: T) = + ## Division uint128 by uint64 + ## Warning ⚠️ : + ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE + ## - if n_hi > d result is undefined + + # doAssert leadingZeros(d) == 0, "Divisor was not normalized" + + const + size = sizeof(q) * 8 + halfSize = size div 2 + halfMask = (1.T shl halfSize) - 1.T + + template halfQR(n_hi, n_lo, d, d_hi, d_lo: T): tuple[q,r: T] = + + var (q, r) = (n_hi div d_hi, n_hi mod d_hi) + let m = q * d_lo + r = (r shl halfSize) or n_lo + + # Fix the reminder, we're at most 2 iterations off + if r < m: + dec q + r += d + if r >= d and r < m: + dec q + r += d + r -= m + (q, r) + + let + d_hi = d shr halfSize + d_lo = d and halfMask + n_lohi = n_lo shr halfSize + n_lolo = n_lo and halfMask + + # First half of the quotient + let (q1, r1) = halfQR(n_hi, n_lohi, d, d_hi, d_lo) + + # Second half + let (q2, r2) = halfQR(r1, n_lolo, d, d_hi, d_lo) + + q = (q1 shl halfSize) or q2 + r = r2 \ No newline at end of file diff --git a/stint/private/primitives/extended_precision.nim b/stint/private/primitives/extended_precision.nim new file mode 100644 index 0000000..4f58e65 --- /dev/null +++ b/stint/private/primitives/extended_precision.nim @@ -0,0 +1,181 @@ +# Stint +# Copyright 2018 Status Research & Development GmbH +# Licensed under either of +# +# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +# +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +# ############################################################ +# +# Extended precision primitives +# +# ############################################################ + +import + ../datatypes, + ./addcarry_subborrow + +# ############################################################ +# +# 32-bit words +# +# ############################################################ + +func div2n1n*(q, r: var uint32, n_hi, n_lo, d: uint32) {.inline.}= + ## Division uint64 by uint32 + ## Warning ⚠️ : + ## - if n_hi == d, quotient does not fit in an uint32 + ## - if n_hi > d result is undefined + ## + ## To avoid issues, n_hi, n_lo, d should be normalized. + ## i.e. shifted (== multiplied by the same power of 2) + ## so that the most significant bit in d is set. 
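+  ##
+  ## Worked example (illustrative, not from the original sources):
+  ## with d = 0x8000_0005 the most significant bit is already set, so no
+  ## normalization is needed. For n_hi = 3, n_lo = 1 the dividend is
+  ## 3 * 2^32 + 1 = 12_884_901_889; since n_hi < d the quotient fits:
+  ## q = 5 and r = 2_147_483_624.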
+ let dividend = (uint64(n_hi) shl 32) or uint64(n_lo) + let divisor = uint64(d) + q = uint32(dividend div divisor) + r = uint32(dividend mod divisor) + +func mul*(hi, lo: var uint32, a, b: uint32) {.inline.} = + ## Extended precision multiplication + ## (hi, lo) <- a*b + let dblPrec = uint64(a) * uint64(b) + when nimvm: + lo = uint32(dblPrec and uint32.high) + else: + lo = uint32(dblPrec) + hi = uint32(dblPrec shr 32) + +func muladd1*(hi, lo: var uint32, a, b, c: uint32) {.inline.} = + ## Extended precision multiplication + addition + ## (hi, lo) <- a*b + c + ## + ## Note: 0xFFFFFFFF² -> (hi: 0xFFFFFFFE, lo: 0x00000001) + ## so adding any c cannot overflow + let dblPrec = uint64(a) * uint64(b) + uint64(c) + when nimvm: + lo = uint32(dblPrec and uint32.high) + else: + lo = uint32(dblPrec) + hi = uint32(dblPrec shr 32) + +func muladd2*(hi, lo: var uint32, a, b, c1, c2: uint32) {.inline.}= + ## Extended precision multiplication + addition + addition + ## This is constant-time on most hardware except some specific one like Cortex M0 + ## (hi, lo) <- a*b + c1 + c2 + ## + ## Note: 0xFFFFFFFF² -> (hi: 0xFFFFFFFE, lo: 0x00000001) + ## so adding 0xFFFFFFFF leads to (hi: 0xFFFFFFFF, lo: 0x00000000) + ## and we have enough space to add again 0xFFFFFFFF without overflowing + let dblPrec = uint64(a) * uint64(b) + uint64(c1) + uint64(c2) + when nimvm: + lo = uint32(dblPrec and uint32.high) + else: + lo = uint32(dblPrec) + hi = uint32(dblPrec shr 32) + +# ############################################################ +# +# 64-bit words +# +# ############################################################ + +when sizeof(int) == 8 and not defined(Stint32): + from ./compiletime_fallback import div2n1n_nim, mul_nim, muladd1_nim, muladd2_nim + + when defined(vcc): + from ./extended_precision_x86_64_msvc import div2n1n_128, mul_128, muladd1_128, muladd2_128 + elif GCC_Compatible: + when X86: + from ./extended_precision_x86_64_gcc import div2n1n_128 + from ./extended_precision_64bit_uint128 import mul_128, muladd1_128, muladd2_128 + else: + from ./extended_precision_64bit_uint128 import div2n1n_128, mul_128, muladd1_128, muladd2_128 + + func mul*(hi, lo: var uint64, u, v: uint64) {.inline.}= + ## Extended precision multiplication + ## (hi, lo) <- u * v + when nimvm: + mul_nim(hi, lo, u, v) + else: + mul_128(hi, lo, u, v) + + func muladd1*(hi, lo: var uint64, a, b, c: uint64) {.inline.}= + ## Extended precision multiplication + addition + ## (hi, lo) <- a*b + c + ## + ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) + ## so adding any c cannot overflow + when nimvm: + muladd1_nim(hi, lo, a, b, c) + else: + muladd1_128(hi, lo, a, b, c) + + func muladd2*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}= + ## Extended precision multiplication + addition + addition + ## (hi, lo) <- a*b + c1 + c2 + ## + ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) + ## so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000) + ## and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing + when nimvm: + muladd2_nim(hi, lo, a, b, c1, c2) + else: + muladd2_128(hi, lo, a, b, c1, c2) + + func div2n1n*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}= + ## Division uint128 by uint64 + ## Warning ⚠️ : + ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE + ## - if n_hi > d result is undefined + when nimvm: + div2n1n_nim(q, r, n_hi, n_lo, d) + else: + div2n1n_128(q, r, n_hi, n_lo, d) + +# 
############################################################ +# +# Composite primitives +# +# ############################################################ + +func mulDoubleAdd2*[T: uint32|uint64](r2: var Carry, r1, r0: var T, a, b, c: T, dHi: Carry, dLo: T) {.inline.} = + ## (r2, r1, r0) <- 2*a*b + c + (dHi, dLo) + ## with r = (r2, r1, r0) a triple-word number + ## and d = (dHi, dLo) a double-word number + ## r2 and dHi are carries, either 0 or 1 + + var carry: Carry + + # (r1, r0) <- a*b + # Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFF_FFFFFFFE, lo: 0x00000000_00000001) + mul(r1, r0, a, b) + + # (r2, r1, r0) <- 2*a*b + # Then (hi: 0xFFFFFFFF_FFFFFFFE, lo: 0x00000000_00000001) * 2 + # (carry: 1, hi: 0xFFFFFFFF_FFFFFFFC, lo: 0x00000000_00000002) + addC(carry, r0, r0, r0, Carry(0)) + addC(r2, r1, r1, r1, carry) + + # (r1, r0) <- (r1, r0) + c + # Adding any uint64 cannot overflow into r2 for example Adding 2^64-1 + # (carry: 1, hi: 0xFFFFFFFF_FFFFFFFD, lo: 0x00000000_00000001) + addC(carry, r0, r0, c, Carry(0)) + addC(carry, r1, r1, T(0), carry) + + # (r1, r0) <- (r1, r0) + (dHi, dLo) with dHi a carry (previous limb r2) + # (dHi, dLo) is at most (dhi: 1, dlo: 0xFFFFFFFF_FFFFFFFF) + # summing into (carry: 1, hi: 0xFFFFFFFF_FFFFFFFD, lo: 0x00000000_00000001) + # result at most in (carry: 1, hi: 0xFFFFFFFF_FFFFFFFF, lo: 0x00000000_00000000) + addC(carry, r0, r0, dLo, Carry(0)) + addC(carry, r1, r1, T(dHi), carry) + +func mulAcc*[T: uint32|uint64](t, u, v: var T, a, b: T) {.inline.} = + ## (t, u, v) <- (t, u, v) + a * b + var UV: array[2, T] + var carry: Carry + mul(UV[1], UV[0], a, b) + addC(carry, v, v, UV[0], Carry(0)) + addC(carry, u, u, UV[1], carry) + t += T(carry) diff --git a/stint/private/primitives/extended_precision_64bit_uint128.nim b/stint/private/primitives/extended_precision_64bit_uint128.nim new file mode 100644 index 0000000..3289ce0 --- /dev/null +++ b/stint/private/primitives/extended_precision_64bit_uint128.nim @@ -0,0 +1,95 @@ +# Stint +# Copyright 2018 Status Research & Development GmbH +# Licensed under either of +# +# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +# +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
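+
+# As an illustrative sketch (not from the original sources), the `mul`/`muladd`
+# dispatchers in extended_precision.nim bottom out in the `__int128` helpers
+# below; for instance a full 64x64 -> 128-bit product:
+#
+# ```Nim
+# var hi, lo: uint64
+# mul_128(hi, lo, high(uint64), 2'u64)
+# # hi = 1, lo = 0xFFFFFFFF_FFFFFFFE, i.e. (2^64 - 1) * 2 = 2^65 - 2
+# ```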
+ +import ../datatypes + +# ############################################################ +# +# Extended precision primitives on GCC & Clang (all CPU archs) +# +# ############################################################ + +static: + doAssert GCC_Compatible + doAssert sizeof(int) == 8 + +func div2n1n_128*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}= + ## Division uint128 by uint64 + ## Warning ⚠️ : + ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE on some platforms + ## - if n_hi > d result is undefined + var dblPrec {.noinit.}: uint128 + {.emit:[dblPrec, " = (unsigned __int128)", n_hi," << 64 | (unsigned __int128)",n_lo,";"].} + + # Don't forget to dereference the var param in C mode + when defined(cpp): + {.emit:[q, " = (NU64)(", dblPrec," / ", d, ");"].} + {.emit:[r, " = (NU64)(", dblPrec," % ", d, ");"].} + else: + {.emit:["*",q, " = (NU64)(", dblPrec," / ", d, ");"].} + {.emit:["*",r, " = (NU64)(", dblPrec," % ", d, ");"].} + +func mul_128*(hi, lo: var uint64, a, b: uint64) {.inline.} = + ## Extended precision multiplication + ## (hi, lo) <- a*b + block: + var dblPrec {.noinit.}: uint128 + {.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b,";"].} + + # Don't forget to dereference the var param in C mode + when defined(cpp): + {.emit:[hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:[lo, " = (NU64)", dblPrec,";"].} + else: + {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:["*",lo, " = (NU64)", dblPrec,";"].} + +func muladd1_128*(hi, lo: var uint64, a, b, c: uint64) {.inline.} = + ## Extended precision multiplication + addition + ## (hi, lo) <- a*b + c + ## + ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) + ## so adding any c cannot overflow + ## + ## This is constant-time on most hardware + ## See: https://www.bearssl.org/ctmul.html + block: + var dblPrec {.noinit.}: uint128 + {.emit:[dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, " + (unsigned __int128)",c,";"].} + + # Don't forget to dereference the var param in C mode + when defined(cpp): + {.emit:[hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:[lo, " = (NU64)", dblPrec,";"].} + else: + {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:["*",lo, " = (NU64)", dblPrec,";"].} + +func muladd2_128*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}= + ## Extended precision multiplication + addition + addition + ## This is constant-time on most hardware except some specific one like Cortex M0 + ## (hi, lo) <- a*b + c1 + c2 + ## + ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001) + ## so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000) + ## and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing + block: + var dblPrec {.noinit.}: uint128 + {.emit:[ + dblPrec, " = (unsigned __int128)", a," * (unsigned __int128)", b, + " + (unsigned __int128)",c1," + (unsigned __int128)",c2,";" + ].} + + # Don't forget to dereference the var param in C mode + when defined(cpp): + {.emit:[hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:[lo, " = (NU64)", dblPrec,";"].} + else: + {.emit:["*",hi, " = (NU64)(", dblPrec," >> ", 64'u64, ");"].} + {.emit:["*",lo, " = (NU64)", dblPrec,";"].} diff --git a/stint/private/primitives/extended_precision_x86_64_gcc.nim b/stint/private/primitives/extended_precision_x86_64_gcc.nim new file mode 100644 index 0000000..9c7b7ab --- /dev/null +++ 
b/stint/private/primitives/extended_precision_x86_64_gcc.nim @@ -0,0 +1,57 @@ +# Stint +# Copyright 2018 Status Research & Development GmbH +# Licensed under either of +# +# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +# +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import ../datatypes + +# ############################################################ +# +# Extended precision primitives for X86-64 on GCC & Clang +# +# ############################################################ + +static: + doAssert(defined(gcc) or defined(clang) or defined(llvm_gcc)) + doAssert sizeof(int) == 8 + doAssert X86 + +func div2n1n_128*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}= + ## Division uint128 by uint64 + ## Warning ⚠️ : + ## - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE + ## - if n_hi > d result is undefined + + # DIV r/m64 + # Divide RDX:RAX (n_hi:n_lo) by r/m64 + # + # Inputs + # - numerator high word in RDX, + # - numerator low word in RAX, + # - divisor as r/m parameter (register or memory at the compiler discretion) + # Result + # - Quotient in RAX + # - Remainder in RDX + + # 1. name the register/memory "divisor" + # 2. don't forget to dereference the var hidden pointer + # 3. - + # 4. no clobbered registers beside explicitly used RAX and RDX + when defined(cpp): + asm """ + divq %[divisor] + : "=a" (`q`), "=d" (`r`) + : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`) + : + """ + else: + asm """ + divq %[divisor] + : "=a" (`*q`), "=d" (`*r`) + : "d" (`n_hi`), "a" (`n_lo`), [divisor] "rm" (`d`) + : + """ diff --git a/stint/private/primitives/extended_precision_x86_64_msvc.nim b/stint/private/primitives/extended_precision_x86_64_msvc.nim new file mode 100644 index 0000000..3a66457 --- /dev/null +++ b/stint/private/primitives/extended_precision_x86_64_msvc.nim @@ -0,0 +1,77 @@ +# Stint +# Copyright 2018 Status Research & Development GmbH +# Licensed under either of +# +# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +# +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
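+
+# Illustrative expectation for the MSVC intrinsics wrapped below (not from
+# the original sources): `_udiv128` divides the 128-bit value hi:lo by a
+# 64-bit divisor, mirroring the `divq` semantics in the GCC file above:
+#
+# ```Nim
+# var r: uint64
+# let q = udiv128(1'u64, 0'u64, 2'u64, r)
+# # computes 2^64 div 2: q = 0x8000000000000000'u64, r = 0
+# ```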
+
+import
+  ../datatypes,
+  ./addcarry_subborrow
+
+# ############################################################
+#
+#        Extended precision primitives for X86-64 on MSVC
+#
+# ############################################################
+
+static:
+  doAssert defined(vcc)
+  doAssert sizeof(int) == 8
+  doAssert X86
+
+func udiv128(highDividend, lowDividend, divisor: uint64, remainder: var uint64): uint64 {.importc:"_udiv128", header: "<intrin.h>", nodecl.}
+  ## Division 128 by 64, Microsoft only, 64-bit only,
+  ## returns the quotient as return value and the remainder as a var parameter
+  ## Warning ⚠️ :
+  ##   - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
+  ##   - if n_hi > d result is undefined
+
+func umul128(a, b: uint64, hi: var uint64): uint64 {.importc:"_umul128", header:"<intrin.h>", nodecl.}
+  ## (hi, lo) <-- a * b
+  ## Return value is the low word
+
+func div2n1n_128*(q, r: var uint64, n_hi, n_lo, d: uint64) {.inline.}=
+  ## Division uint128 by uint64
+  ## Warning ⚠️ :
+  ##   - if n_hi == d, quotient does not fit in an uint64 and will throw SIGFPE
+  ##   - if n_hi > d result is undefined
+  q = udiv128(n_hi, n_lo, d, r)
+
+func mul_128*(hi, lo: var uint64, a, b: uint64) {.inline.} =
+  ## Extended precision multiplication
+  ## (hi, lo) <- a*b
+  lo = umul128(a, b, hi)
+
+func muladd1_128*(hi, lo: var uint64, a, b, c: uint64) {.inline.} =
+  ## Extended precision multiplication + addition
+  ## (hi, lo) <- a*b + c
+  ##
+  ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
+  ##       so adding any c cannot overflow
+  var carry: Carry
+  lo = umul128(a, b, hi)
+  addC(carry, lo, lo, c, Carry(0))
+  addC(carry, hi, hi, 0, carry)
+
+func muladd2_128*(hi, lo: var uint64, a, b, c1, c2: uint64) {.inline.}=
+  ## Extended precision multiplication + addition + addition
+  ## This is constant-time on most hardware except some specific one like Cortex M0
+  ## (hi, lo) <- a*b + c1 + c2
+  ##
+  ## Note: 0xFFFFFFFF_FFFFFFFF² -> (hi: 0xFFFFFFFFFFFFFFFE, lo: 0x0000000000000001)
+  ##       so adding 0xFFFFFFFFFFFFFFFF leads to (hi: 0xFFFFFFFFFFFFFFFF, lo: 0x0000000000000000)
+  ##       and we have enough space to add again 0xFFFFFFFFFFFFFFFF without overflowing
+  # For speed this could be implemented with parallel pipelined carry chains
+  # via MULX + ADCX + ADOX
+  var carry1, carry2: Carry
+
+  lo = umul128(a, b, hi)
+  # Carry chain 1
+  addC(carry1, lo, lo, c1, Carry(0))
+  addC(carry1, hi, hi, 0, carry1)
+  # Carry chain 2
+  addC(carry2, lo, lo, c2, Carry(0))
+  addC(carry2, hi, hi, 0, carry2)
diff --git a/stint/private/uint_addsub.nim b/stint/private/uint_addsub.nim
index c4f11bc..f5e4882 100644
--- a/stint/private/uint_addsub.nim
+++ b/stint/private/uint_addsub.nim
@@ -1,5 +1,5 @@
 # Stint
-# Copyright 2018 Status Research & Development GmbH
+# Copyright 2018-Present Status Research & Development GmbH
 # Licensed under either of
 #
 #  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
@@ -7,36 +7,51 @@
 #
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
-import ./conversion, ./initialization,
-       ./datatypes,
-       ./uint_comparison,
-       ./uint_bitwise_ops
+import
+  # Internal
+  ./datatypes,
+  ./primitives/addcarry_subborrow

-# ############ Addition & Substraction ############ #
+# Addsub
+# --------------------------------------------------------
+{.push raises: [], inline, noinit, gcsafe.}

-func `+`*(x, y: UintImpl): UintImpl {.inline.}
-  # Forward declaration
+func sum*(r: var StUint, a, b: StUint) =
+  ## Addition for multi-precision unsigned int
+  var carry = Carry(0)
+  for i in 0 ..< r.limbs.len:
+    addC(carry, r[i], a[i], b[i], carry)
+  r.clearExtraBitsOverMSB()

-func `+=`*(x: var UintImpl, y: UintImpl) {.inline.}=
+func `+=`*(a: var StUint, b: StUint) =
   ## In-place addition for multi-precision unsigned int
-  type SubTy = type x.lo
-  x.lo += y.lo
-  x.hi += (x.lo < y.lo).toSubtype(SubTy) + y.hi # This helps the compiler produce ADC (add with carry)
-
-func `+`*(x, y: UintImpl): UintImpl {.inline.}=
-  # Addition for multi-precision unsigned int
-  result = x
-  result += y
-
-func `-`*(x, y: UintImpl): UintImpl {.inline.}=
-  # Substraction for multi-precision unsigned int
-  type SubTy = type x.lo
-  result.lo = x.lo - y.lo
-  result.hi = x.hi - y.hi - (x.lo < y.lo).toSubtype(SubTy) # This might (?) help the compiler produce SBB (sub with borrow)
-
-func `-=`*(x: var UintImpl, y: UintImpl) {.inline.}=
-  ## In-place substraction for multi-precision unsigned int
-  x = x - y
+  a.sum(a, b)
+
+func diff*(r: var StUint, a, b: StUint) =
+  ## Subtraction for multi-precision unsigned int
+  var borrow = Borrow(0)
+  for i in 0 ..< r.limbs.len:
+    subB(borrow, r[i], a[i], b[i], borrow)
+  r.clearExtraBitsOverMSB()

-func inc*(x: var UintImpl){.inline.}=
-  x += one(type x)
+func `-=`*(a: var StUint, b: StUint) =
+  ## In-place subtraction for multi-precision unsigned int
+  a.diff(a, b)
+
+func inc*(a: var StUint, w: Word = 1) =
+  var carry = Carry(0)
+  addC(carry, a.limbs[0], a.limbs[0], w, carry)
+  for i in 1 ..< a.limbs.len:
+    addC(carry, a.limbs[i], a.limbs[i], 0, carry)
+  a.clearExtraBitsOverMSB()
+
+func sum*(r: var StUint, a: StUint, b: SomeUnsignedInt) =
+  ## Addition for multi-precision unsigned int
+  ## with an unsigned integer
+  r = a
+  r.inc(Word(b))
+
+func `+=`*(a: var StUint, b: SomeUnsignedInt) =
+  ## In-place addition for multi-precision unsigned int
+  ## with an unsigned integer
+  a.inc(Word(b))
diff --git a/stint/private/uint_bitwise.nim b/stint/private/uint_bitwise.nim
new file mode 100644
index 0000000..0085a62
--- /dev/null
+++ b/stint/private/uint_bitwise.nim
@@ -0,0 +1,86 @@
+# Stint
+# Copyright 2018-Present Status Research & Development GmbH
+# Licensed under either of
+#
+#  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
+#  * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
+#
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  # Status lib
+  stew/bitops2,
+  # Internal
+  ./datatypes
+
+# Bitwise operations
+# --------------------------------------------------------
+{.push raises: [], inline, noinit, gcsafe.}
+
+func bitnot*(r: var StUint, a: StUint) =
+  ## Bitwise complement of unsigned integer a
+  ## i.e. 
flips all bits of the input + for i in 0 ..< r.limbs.len: + r[i] = not a[i] + r.clearExtraBitsOverMSB() + +func bitor*(r: var StUint, a, b: StUint) = + ## `Bitwise or` of numbers a and b + for i in 0 ..< r.limbs.len: + r[i] = a[i] or b[i] + +func bitand*(r: var StUint, a, b: StUint) = + ## `Bitwise and` of numbers a and b + for i in 0 ..< r.limbs.len: + r[i] = a[i] and b[i] + +func bitxor*(r: var StUint, a, b: StUint) = + ## `Bitwise xor` of numbers x and y + for i in 0 ..< r.limbs.len: + r[i] = a[i] xor b[i] + r.clearExtraBitsOverMSB() + +func countOnes*(a: StUint): int = + result = 0 + for i in 0 ..< a.limbs.len: + result += countOnes(a[i]) + +func parity*(a: StUint): int = + result = parity(a.limbs[0]) + for i in 1 ..< a.limbs.len: + result = result xor parity(a.limbs[i]) + +func leadingZeros*(a: StUint): int = + result = 0 + + # Adjust when we use only part of the word size + var extraBits = WordBitWidth * a.limbs.len - a.bits + + for i in countdown(a.limbs.len-1, 0): + let zeroCount = a.limbs[i].leadingZeros() + if extraBits > 0: + result += zeroCount - min(extraBits, WordBitWidth) + extraBits -= WordBitWidth + else: + result += zeroCount + if zeroCount != WordBitWidth: + break + +func trailingZeros*(a: StUint): int = + result = 0 + for i in 0 ..< a.limbs.len: + let zeroCount = a[i].trailingZeros() + result += zeroCount + if zeroCount != WordBitWidth: + break + + when a.limbs.len * WordBitWidth != a.bits: + if result > a.bits: + result = a.bits + +func firstOne*(a: StUint): int = + result = trailingZeros(a) + if result == a.limbs.len * WordBitWidth: + result = 0 + else: + result += 1 diff --git a/stint/private/uint_bitwise_ops.nim b/stint/private/uint_bitwise_ops.nim deleted file mode 100644 index d208e52..0000000 --- a/stint/private/uint_bitwise_ops.nim +++ /dev/null @@ -1,62 +0,0 @@ -# Stint -# Copyright 2018 Status Research & Development GmbH -# Licensed under either of -# -# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) -# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) -# -# at your option. This file may not be copied, modified, or distributed except according to those terms. - -import ./datatypes, ./bitops2_priv - -func `not`*(x: UintImpl): UintImpl {.inline.}= - ## Bitwise complement of unsigned integer x - applyHiLo(x, `not`) - -func `or`*(x, y: UintImpl): UintImpl {.inline.}= - ## `Bitwise or` of numbers x and y - applyHiLo(x, y, `or`) - -func `and`*(x, y: UintImpl): UintImpl {.inline.}= - ## `Bitwise and` of numbers x and y - applyHiLo(x, y, `and`) - -func `xor`*(x, y: UintImpl): UintImpl {.inline.}= - ## `Bitwise xor` of numbers x and y - applyHiLo(x, y, `xor`) - -func `shr`*(x: UintImpl, y: SomeInteger): UintImpl {.inline.} - # Forward declaration - -func `shl`*(x: UintImpl, y: SomeInteger): UintImpl {.inline.}= - ## Compute the `shift left` operation of x and y - # Note: inlining this poses codegen/aliasing issue when doing `x = x shl 1` - - # TODO: would it be better to reimplement this with words iteration? - const halfSize: type(y) = bitsof(x) div 2 - - if y == 0: - return x - elif y == halfSize: - result.hi = x.lo - elif y < halfSize: - result.hi = (x.hi shl y) or (x.lo shr (halfSize - y)) - result.lo = x.lo shl y - else: - result.hi = x.lo shl (y - halfSize) - -func `shr`*(x: UintImpl, y: SomeInteger): UintImpl {.inline.}= - ## Compute the `shift right` operation of x and y - ## Similar to C standard, result is undefined if y is bigger - ## than the number of bits in x. 
- const halfSize: type(y) = bitsof(x) div 2 - - if y == 0: - return x - elif y == halfSize: - result.lo = x.hi - elif y < halfSize: - result.lo = (x.lo shr y) or (x.hi shl (halfSize - y)) - result.hi = x.hi shr y - else: - result.lo = x.hi shr (y - halfSize) diff --git a/stint/private/uint_comparison.nim b/stint/private/uint_comparison.nim deleted file mode 100644 index 2364e5c..0000000 --- a/stint/private/uint_comparison.nim +++ /dev/null @@ -1,42 +0,0 @@ -# Stint -# Copyright 2018 Status Research & Development GmbH -# Licensed under either of -# -# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) -# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) -# -# at your option. This file may not be copied, modified, or distributed except according to those terms. - -import ./datatypes - -func isZero*(n: SomeUnsignedInt): bool {.inline.} = - n == 0 - -func isZero*(n: UintImpl): bool {.inline.} = - n.hi.isZero and n.lo.isZero - -func `<`*(x, y: UintImpl): bool {.inline.}= - # Lower comparison for multi-precision integers - x.hi < y.hi or - (x.hi == y.hi and x.lo < y.lo) - -func `==`*(x, y: UintImpl): bool {.inline.}= - # Equal comparison for multi-precision integers - x.hi == y.hi and x.lo == y.lo - -func `<=`*(x, y: UintImpl): bool {.inline.}= - # Lower or equal comparison for multi-precision integers - x.hi < y.hi or - (x.hi == y.hi and x.lo <= y.lo) - -func isEven*(x: SomeUnsignedInt): bool {.inline.} = - (x and 1) == 0 - -func isEven*(x: UintImpl): bool {.inline.}= - x.lo.isEven - -func isOdd*(x: SomeUnsignedInt): bool {.inline.} = - not x.isEven - -func isOdd*(x: UintImpl): bool {.inline.}= - not x.isEven diff --git a/stint/private/uint_div.nim b/stint/private/uint_div.nim index 8afe840..c0a0798 100644 --- a/stint/private/uint_div.nim +++ b/stint/private/uint_div.nim @@ -7,304 +7,253 @@ # # at your option. This file may not be copied, modified, or distributed except according to those terms. -import ./bitops2_priv, ./conversion, ./initialization, - ./datatypes, - ./uint_comparison, - ./uint_bitwise_ops, - ./uint_addsub, - ./uint_mul - -# ################### Division ################### # -# We use the following algorithm: -# - Fast recursive division by Burnikel and Ziegler - -################################################################################################################### -## ## -## Grade school division, but with (very) large digits, dividing [a1,a2,a3,a4] by [b1,b2]: ## -## ## -## +----+----+----+----+ +----+----+ +----+ ## -## | a1 | a2 | a3 | a4 | / | b1 | b2 | = | q1 | DivideThreeHalvesByTwo(a1a2, a3, b1b2, n, q1, r1r2) ## -## +----+----+----+----+ +----+----+ +----+ ## -## +--------------+ | | ## -## | b1b2 * q1 | | | ## -## +--------------+ | | ## -## - ================ v | ## -## +----+----+----+ +----+----+ | +----+ ## -## | r1 | r2 | a4 | / | b1 | b2 | = | | q2 | DivideThreeHalvesByTwo(r1r2, a4, b1b2, n, q1, r1r2) ## -## +----+----+----+ +----+----+ | +----+ ## -## +--------------+ | | ## -## | b1b2 * q2 | | | ## -## +--------------+ | | ## -## - ================ v v ## -## +----+----+ +----+----+ ## -## | r1 | r2 | | q1 | q2 | r1r2 = a1a2a3a4 mod b1b2, q1q2 = a1a2a3a4 div b1b2 ## -## +----+----+ +----+----+ , ## -## ## -## Note: in the diagram above, a1, b1, q1, r1 etc. are the most significant "digits" of their numbers. 
## -## ## -################################################################################################################### - -func div2n1n[T: SomeUnsignedInt](q, r: var T, n_hi, n_lo, d: T) -func div2n1n(q, r: var UintImpl, ah, al, b: UintImpl) - # Forward declaration - -func divmod*(x, y: SomeUnsignedInt): tuple[quot, rem: SomeUnsignedInt] {.inline.}= - # hopefully the compiler fuse that in a single op - (x div y, x mod y) - -func divmod*[T](x, y: UintImpl[T]): tuple[quot, rem: UintImpl[T]] - # Forward declaration - -func div3n2n[T]( q: var UintImpl[T], - r: var UintImpl[UintImpl[T]], - a2, a1, a0: UintImpl[T], - b: UintImpl[UintImpl[T]]) = - - var - c: UintImpl[T] - d: UintImpl[UintImpl[T]] - carry: bool - - if a2 < b.hi: - div2n1n(q, c, a2, a1, b.hi) +import + # Status lib + stew/bitops2, + # Internal + ./datatypes, + ./uint_bitwise, + ./primitives/[addcarry_subborrow, extended_precision] + +# Division +# -------------------------------------------------------- + +func shortDiv*(a: var Limbs, k: Word): Word = + ## Divide `a` by k in-place and return the remainder + result = Word(0) + + let clz = leadingZeros(k) + let normK = k shl clz + + for i in countdown(a.len-1, 0): + # dividend = 2^64 * remainder + a[i] + var hi = result + var lo = a[i] + if hi == 0: + if lo < k: + a[i] = 0 + elif lo == k: + a[i] = 1 + result = 0 + continue + # Normalize, shifting the remainder by clz(k) cannot overflow. + hi = (hi shl clz) or (lo shr (WordBitWidth - clz)) + lo = lo shl clz + div2n1n(a[i], result, hi, lo, normK) + # Undo normalization + result = result shr clz + +func shlAddMod_multi(a: var openArray[Word], c: Word, + M: openArray[Word], mBits: int): Word = + ## Fused modular left-shift + add + ## Shift input `a` by a word and add `c` modulo `M` + ## + ## Specialized for M being a multi-precision integer. + ## + ## With a word W = 2^WordBitWidth and a modulus M + ## Does a <- a * W + c (mod M) + ## and returns q = (a * W + c ) / M + ## + ## The modulus `M` most-significant bit at `mBits` MUST be set. + + # Assuming 64-bit words + let hi = a[^1] # Save the high word to detect carries + let R = mBits and (WordBitWidth - 1) # R = mBits mod 64 + + var a0, a1, m0: Word + if R == 0: # If the number of mBits is a multiple of 64 + a0 = a[^1] # + copyWords(a, 1, a, 0, a.len-1) # we can just shift words + a[0] = c # and replace the first one by c + a1 = a[^1] + m0 = M[^1] + else: # Else: need to deal with partial word shifts at the edge. + let clz = WordBitWidth-R + a0 = (a[^1] shl clz) or (a[^2] shr R) + copyWords(a, 1, a, 0, a.len-1) + a[0] = c + a1 = (a[^1] shl clz) or (a[^2] shr R) + m0 = (M[^1] shl clz) or (M[^2] shr R) + + # m0 has its high bit set. (a0, a1)/m0 fits in a limb. + # Get a quotient q, at most we will be 2 iterations off + # from the true quotient + var q: Word # Estimate quotient + if a0 == m0: # if a_hi == divisor + q = high(Word) # quotient = MaxWord (0b1111...1111) + elif a0 == 0 and a1 < m0: # elif q == 0, true quotient = 0 + q = 0 + return q else: - q = zero(type q) - one(type q) # We want 0xFFFFF .... 
- c = a1 + b.hi - if c < a1: - carry = true - - extPrecMul[T](d, q, b.lo) - let ca0 = UintImpl[type c](hi: c, lo: a0) - - r = ca0 - d - - if (not carry) and (d > ca0): - q -= one(type q) - r += b - - # if there was no carry - if r > b: - q -= one(type q) - r += b - -proc div3n2n[T: SomeUnsignedInt]( - q: var T, - r: var UintImpl[T], - a2, a1, a0: T, - b: UintImpl[T]) = - - var - c: T - d: UintImpl[T] - carry: bool - - if a2 < b.hi: - div2n1n(q, c, a2, a1, b.hi) - + var r: Word + div2n1n(q, r, a0, a1, m0) # else instead of being of by 0, 1 or 2 + q -= 1 # we return q-1 to be off by -1, 0 or 1 + + # Now substract a*2^64 - q*m + var carry = Word(0) + var overM = true # Track if quotient greater than the modulus + + for i in 0 ..< M.len: + var qm_lo: Word + block: # q*m + # q * p + carry (doubleword) carry from previous limb + muladd1(carry, qm_lo, q, M[i], carry) + + block: # a*2^64 - q*m + var borrow: Borrow + subB(borrow, a[i], a[i], qm_lo, Borrow(0)) + carry += Word(borrow) # Adjust if borrow + + if a[i] != M[i]: + overM = a[i] > M[i] + + # Fix quotient, the true quotient is either q-1, q or q+1 + # + # if carry < q or carry == q and overM we must do "a -= M" + # if carry > hi (negative result) we must do "a += M" + if carry > hi: + var c = Carry(0) + for i in 0 ..< a.len: + addC(c, a[i], a[i], M[i], c) + q -= 1 + elif overM or (carry < hi): + var b = Borrow(0) + for i in 0 ..< a.len: + subB(b, a[i], a[i], M[i], b) + q += 1 + + return q + +func shlAddMod(a: var openArray[Word], c: Word, + M: openArray[Word], mBits: int): Word {.inline.}= + ## Fused modular left-shift + add + ## Shift input `a` by a word and add `c` modulo `M` + ## + ## With a word W = 2^WordBitWidth and a modulus M + ## Does a <- a * W + c (mod M) + ## and returns q = (a * W + c ) / M + ## + ## The modulus `M` most-significant bit at `mBits` MUST be set. + if mBits <= WordBitWidth: + # If M fits in a single limb + + # We normalize M with clz so that the MSB is set + # And normalize (a * 2^64 + c) by R as well to maintain the result + # This ensures that (a0, a1)/p0 fits in a limb. + let R = mBits and (WordBitWidth - 1) + let clz = WordBitWidth-R + + # (hi, lo) = a * 2^64 + c + let hi = (a[0] shl clz) or (c shr R) + let lo = c shl clz + let m0 = M[0] shl clz + + var q, r: Word + div2n1n(q, r, hi, lo, m0) + a[0] = r shr clz + return q else: - q = 0.T - 1.T # We want 0xFFFFF .... 
- c = a1 + b.hi - if c < a1: - carry = true - - extPrecMul[T](d, q, b.lo) - let ca0 = UintImpl[T](hi: c, lo: a0) - r = ca0 - d - - if (not carry) and d > ca0: - dec q - r += b - - # if there was no carry - if r > b: - dec q - r += b - -func div2n1n(q, r: var UintImpl, ah, al, b: UintImpl) = - - # doAssert leadingZeros(b) == 0, "Divisor was not normalized" - - var s: UintImpl - div3n2n(q.hi, s, ah.hi, ah.lo, al.hi, b) - div3n2n(q.lo, r, s.hi, s.lo, al.lo, b) - -func div2n1n[T: SomeUnsignedInt](q, r: var T, n_hi, n_lo, d: T) = - - # doAssert leadingZeros(d) == 0, "Divisor was not normalized" - - const - size = bitsof(q) - halfSize = size div 2 - halfMask = (1.T shl halfSize) - 1.T - - template halfQR(n_hi, n_lo, d, d_hi, d_lo: T): tuple[q,r: T] = - - var (q, r) = divmod(n_hi, d_hi) - let m = q * d_lo - r = (r shl halfSize) or n_lo - - # Fix the reminder, we're at most 2 iterations off - if r < m: - dec q - r += d - if r >= d and r < m: - dec q - r += d - r -= m - (q, r) - - let - d_hi = d shr halfSize - d_lo = d and halfMask - n_lohi = n_lo shr halfSize - n_lolo = n_lo and halfMask - - # First half of the quotient - let (q1, r1) = halfQR(n_hi, n_lohi, d, d_hi, d_lo) - - # Second half - let (q2, r2) = halfQR(r1, n_lolo, d, d_hi, d_lo) - - q = (q1 shl halfSize) or q2 - r = r2 - -func divmodBZ[T](x, y: UintImpl[T], q, r: var UintImpl[T])= - - doAssert y.isZero.not() # This should be checked on release mode in the divmod caller proc - - if y.hi.isZero: - # Shortcut if divisor is smaller than half the size of the type - if x.hi < y.lo: - # Normalize - let - clz = leadingZeros(y.lo) - xx = x shl clz - yy = y.lo shl clz - - # If y is smaller than the base, normalizing x does not overflow. - # Compute directly the low part - div2n1n(q.lo, r.lo, xx.hi, xx.lo, yy) - # Undo normalization - r.lo = r.lo shr clz - return - - # General case - - # Normalization - let clz = leadingZeros(y) - - let - xx = UintImpl[type x](lo: x) shl clz - yy = y shl clz - - # Compute - div2n1n(q, r, xx.hi, xx.lo, yy) - - # Undo normalization - r = r shr clz - -func divmodBS(x, y: UintImpl, q, r: var UintImpl) = - ## Division for multi-precision unsigned uint - ## Implementation through binary shift division - - doAssert y.isZero.not() # This should be checked on release mode in the divmod caller proc - - type SubTy = type x.lo - - var - shift = y.leadingZeros - x.leadingZeros - d = y shl shift - - r = x - - while shift >= 0: - q += q - if r >= d: - r -= d - q.lo = q.lo or one(SubTy) - - d = d shr 1 - dec(shift) - -const BinaryShiftThreshold = 8 # If the difference in bit-length is below 8 - # binary shift is probably faster - -func divmod*[T](x, y: UintImpl[T]): tuple[quot, rem: UintImpl[T]]= - - let x_clz = x.leadingZeros - let y_clz = y.leadingZeros - - # We short-circuit division depending on special-cases. - # TODO: Constant-time division - if unlikely(y.isZero): - raise newException(DivByZeroDefect, "You attempted to divide by zero") - elif y_clz == (bitsof(y) - 1): - # y is one - result.quot = x - elif (x.hi or y.hi).isZero: - # If computing just on the low part is enough - (result.quot.lo, result.rem.lo) = divmod(x.lo, y.lo) - elif (y and (y - one(type y))).isZero: - # y is a power of 2. (this also matches 0 but it was eliminated earlier) - # TODO. Would it be faster to use countTrailingZero (ctz) + clz == size(y) - 1? - # Especially because we shift by ctz after. - # It is a bit tricky with recursive types. 
An empty n.lo means 0 or sizeof(n.lo)
-    let y_ctz = bitsof(y) - y_clz - 1
-    result.quot = x shr y_ctz
-    result.rem = x and (y - one(type y))
-  elif x == y:
-    result.quot.lo = one(T)
-  elif x < y:
-    result.rem = x
-  elif (y_clz - x_clz) < BinaryShiftThreshold:
-    divmodBS(x, y, result.quot, result.rem)
+    return shlAddMod_multi(a, c, M, mBits)
+
+func divRem*(
+       q, r: var openArray[Word],
+       a, b: openArray[Word]
+     ) =
+  let (aBits, aLen) = usedBitsAndWords(a)
+  let (bBits, bLen) = usedBitsAndWords(b)
+  let rLen = bLen
+
+  if unlikely(bBits == 0):
+    raise newException(DivByZeroDefect, "You attempted to divide by zero")
+
+  if aBits < bBits:
+    # if a uses fewer bits than b,
+    # a < b, so q = 0 and r = a
+    copyWords(r, 0, a, 0, aLen)
+    for i in aLen ..< r.len: # r.len >= rLen
+      r[i] = 0
+    for i in 0 ..< q.len:
+      q[i] = 0
  else:
-    divmodBZ(x, y, result.quot, result.rem)
-
-func `div`*(x, y: UintImpl): UintImpl {.inline.} =
-  ## Division operation for multi-precision unsigned uint
-  divmod(x,y).quot
-
-func `mod`*(x, y: UintImpl): UintImpl {.inline.} =
-  ## Division operation for multi-precision unsigned uint
-  divmod(x,y).rem
-
+    # a is at least as long as the divisor:
+    # we can copy the top bLen-1 words directly
+    # and modular shift-left-add the rest
+    let aOffset = aLen - bLen
+    copyWords(r, 0, a, aOffset+1, bLen-1)
+    r[rLen-1] = 0
+    # Now shift-left the copied words while adding the new word mod b
+    for i in countdown(aOffset, 0):
+      q[i] = shlAddMod(
+        r.toOpenArray(0, rLen-1),
+        a[i],
+        b.toOpenArray(0, bLen-1),
+        bBits
+      )
+
+    # Clean up extra words
+    for i in aOffset+1 ..< q.len:
+      q[i] = 0
+    for i in rLen ..< r.len:
+      r[i] = 0

 # ######################################################################
 # Division implementations
 #
-# Division is the most costly operation
-# And also of critical importance for cryptography application
+# Multi-precision division is a costly operation
+# that is also difficult to implement correctly

 # ##### Research #####

 # Overview of division algorithms:
 # - https://gmplib.org/manual/Division-Algorithms.html#Division-Algorithms
 # - https://gmplib.org/~tege/division-paper.pdf
-# - Comparison of fast division algorithms for large integers: http://bioinfo.ict.ac.cn/~dbu/AlgorithmCourses/Lectures/Hasselstrom2003.pdf
-
-# Libdivide has an implementations faster than hardware if dividing by the same number is needed
-# - http://libdivide.com/documentation.html
-# - https://github.com/ridiculousfish/libdivide/blob/master/libdivide.h
-# Furthermore libdivide also has branchless implementations
-
-# Implementation: we use recursive fast division by Burnikel and Ziegler.
+# - Comparison of fast division algorithms for large integers: http://bioinfo.ict.ac.cn/~dbu/AlgorithmCourses/Lectures/Lec5-Fast-Division-Hasselstrom2003.pdf
+
+# Schoolbook / Knuth Division (Algorithm D)
+# - https://skanthak.homepage.t-online.de/division.html
+#   Review of implementation flaws
+# - Hacker's Delight https://github.com/hcs0/Hackers-Delight/blob/master/divmnu64.c.txt
+# - LLVM: https://github.com/llvm-mirror/llvm/blob/2c4ca68/lib/Support/APInt.cpp#L1289-L1451
+# - ctbignum: https://github.com/niekbouman/ctbignum/blob/v0.5/include/ctbignum/division.hpp
+# - Modern Computer Arithmetic - https://members.loria.fr/PZimmermann/mca/mca-cup-0.5.9.pdf
+#   p14 - 1.4.1 Naive Division
+# - Handbook of Applied Cryptography - https://cacr.uwaterloo.ca/hac/about/chap14.pdf
+#   Chapter 14 algorithm 14.2.5
+
+# Smith Method (and derivatives)
+# This method improves on Knuth's algorithm by ~3x by removing regular normalization
+# - A Multiple-Precision Division Algorithm, David M. Smith
+#   American Mathematical Society, 1996
+#   https://www.ams.org/journals/mcom/1996-65-213/S0025-5718-96-00688-6/S0025-5718-96-00688-6.pdf
 #
-# It is build upon divide and conquer algorithm that can be found in:
-# - Hacker's delight: http://www.hackersdelight.org/hdcodetxt/divDouble.c.txt
-# - Libdivide
-# - Code project: https://www.codeproject.com/Tips/785014/UInt-Division-Modulus
-# - Cuda-uint128 (unfinished): https://github.com/curtisseizert/CUDA-uint128/blob/master/cuda_uint128.h
-# - Mpdecimal: https://github.com/status-im/nim-decimal/blob/9b65e95299cb582b14e0ae9a656984a2ce0bab03/decimal/mpdecimal_wrapper/generated/basearith.c#L305-L412
-
-# Description of recursive fast division by Burnikel and Ziegler (http://www.mpi-sb.mpg.de/~ziegler/TechRep.ps.gz):
-# - Python implementation: https://bugs.python.org/file11060/fast_div.py and discussion https://bugs.python.org/issue3451
-# - C++ implementation: https://github.com/linbox-team/givaro/blob/master/src/kernel/recint/rudiv.h
-# - The Handbook of Elliptic and Hyperelliptic Cryptography Algorithm 10.35 on page 188 has a more explicit version of the div2NxN algorithm. This algorithm is directly recursive and avoids the mutual recursion of the original paper's calls between div2NxN and div3Nx2N.
+# - An Efficient Multiple-Precision Division Algorithm,
+#   Liusheng Huang, Hong Zhong, Hong Shen, Yonglong Luo, 2005
+#   https://ieeexplore.ieee.org/document/1579076
+#
+# - Efficient multiple-precision integer division algorithm
+#   Debapriyay Mukhopadhyaya, Subhas C. Nandy, 2014
+#   https://www.sciencedirect.com/science/article/abs/pii/S0020019013002627
+
+# Recursive division by Burnikel and Ziegler (http://www.mpi-sb.mpg.de/~ziegler/TechRep.ps.gz):
+# - Python implementation: https://bugs.python.org/file11060/fast_div.py and discussion https://bugs.python.org/issue3451
+# - C++ implementation: https://github.com/linbox-team/givaro/blob/master/src/kernel/recint/rudiv.h
+# - The Handbook of Elliptic and Hyperelliptic Cryptography Algorithm 10.35 on page 188 has a more explicit version of the div2NxN algorithm. This algorithm is directly recursive and avoids the mutual recursion of the original paper's calls between div2NxN and div3Nx2N.
+# - Modern Computer Arithmetic - https://members.loria.fr/PZimmermann/mca/mca-cup-0.5.9.pdf +# p18 - 1.4.3 Divide and Conquer Division + +# Newton Raphson Iterations +# - Putty (constant-time): https://github.com/github/putty/blob/0.74/mpint.c#L1818-L2112 +# - Modern Computer Arithmetic - https://members.loria.fr/PZimmermann/mca/mca-cup-0.5.9.pdf +# p18 - 1.4.3 Divide and Conquer Division # Other libraries that can be used as reference for alternative (?) implementations: # - TTMath: https://github.com/status-im/nim-ttmath/blob/8f6ff2e57b65a350479c4012a53699e262b19975/src/headers/ttmathuint.h#L1530-L2383 # - LibTomMath: https://github.com/libtom/libtommath -# - Google Abseil: https://github.com/abseil/abseil-cpp/tree/master/absl/numeric -# - Crypto libraries like libsecp256k1, OpenSSL, ... though they are not generics. (uint256 only for example) +# - Google Abseil for uint128: https://github.com/abseil/abseil-cpp/tree/master/absl/numeric # Note: GMP/MPFR are GPL. The papers can be used but not their code. + +# Related research +# - Efficient divide-and-conquer multiprecision integer division +# William Hart, IEEE 2015 +# https://github.com/wbhart/bsdnt +# https://ieeexplore.ieee.org/document/7203801 \ No newline at end of file diff --git a/stint/private/uint_exp.nim b/stint/private/uint_exp.nim deleted file mode 100644 index 1103ed7..0000000 --- a/stint/private/uint_exp.nim +++ /dev/null @@ -1,50 +0,0 @@ -# Stint -# Copyright 2018 Status Research & Development GmbH -# Licensed under either of -# -# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) -# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) -# -# at your option. This file may not be copied, modified, or distributed except according to those terms. 
- -import - ./datatypes, - ./uint_bitwise_ops, ./uint_mul, ./initialization, ./uint_comparison - -func pow*(x: UintImpl, y: Natural): UintImpl = - ## Compute ``x`` to the power of ``y``, - ## ``x`` must be non-negative - - # Implementation uses exponentiation by squaring - # See Nim math module: https://github.com/nim-lang/Nim/blob/4ed24aa3eb78ba4ff55aac3008ec3c2427776e50/lib/pure/math.nim#L429 - # And Eli Bendersky's blog: https://eli.thegreenplace.net/2009/03/21/efficient-integer-exponentiation-algorithms - - var (x, y) = (x, y) - result = one(type x) - - while true: - if bool(y and 1): # if y is odd - result = result * x - y = y shr 1 - if y == 0: - break - x = x * x - -func pow*(x: UintImpl, y: UintImpl): UintImpl = - ## Compute ``x`` to the power of ``y``, - ## ``x`` must be non-negative - - # Implementation uses exponentiation by squaring - # See Nim math module: https://github.com/nim-lang/Nim/blob/4ed24aa3eb78ba4ff55aac3008ec3c2427776e50/lib/pure/math.nim#L429 - # And Eli Bendersky's blog: https://eli.thegreenplace.net/2009/03/21/efficient-integer-exponentiation-algorithms - - var (x, y) = (x, y) - result = one(type x) - - while true: - if y.isOdd: - result = result * x - y = y shr 1 - if y.isZero: - break - x = x * x diff --git a/stint/private/uint_highlow.nim b/stint/private/uint_highlow.nim deleted file mode 100644 index b70af0f..0000000 --- a/stint/private/uint_highlow.nim +++ /dev/null @@ -1,16 +0,0 @@ -# Stint -# Copyright 2018 Status Research & Development GmbH -# Licensed under either of -# -# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) -# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) -# -# at your option. This file may not be copied, modified, or distributed except according to those terms. - -import ./datatypes, ./initialization, ./uint_bitwise_ops - -func low*(T: typedesc[UintImpl]): T {.inline.}= - zero(T) - -func high*(T: typedesc[UintImpl]): T {.inline.}= - not zero(T) diff --git a/stint/private/uint_mul.nim b/stint/private/uint_mul.nim index 055289c..1155344 100644 --- a/stint/private/uint_mul.nim +++ b/stint/private/uint_mul.nim @@ -1,5 +1,5 @@ # Stint -# Copyright 2018 Status Research & Development GmbH +# Copyright 2018-Present Status Research & Development GmbH # Licensed under either of # # * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) @@ -7,160 +7,79 @@ # # at your option. This file may not be copied, modified, or distributed except according to those terms. 
-import macros, - ./conversion, - ./initialization, - ./datatypes, - ./uint_comparison, - ./uint_addsub - -# ################### Multiplication ################### # - -func lo(x: uint64): uint64 {.inline.} = - const - p: uint64 = 32 - base: uint64 = 1'u64 shl p - mask: uint64 = base - 1 - result = x and mask - -func hi(x: uint64): uint64 {.inline.} = - const - p = 32 - result = x shr p - -# No generic, somehow Nim is given ambiguous call with the T: UintImpl overload -func extPrecMul*(result: var UintImpl[uint8], x, y: uint8) {.inline.}= - ## Extended precision multiplication - result = cast[type result](x.asDoubleUint * y.asDoubleUint) - -func extPrecMul*(result: var UintImpl[uint16], x, y: uint16) {.inline.}= - ## Extended precision multiplication - result = cast[type result](x.asDoubleUint * y.asDoubleUint) - -func extPrecMul*(result: var UintImpl[uint32], x, y: uint32) {.inline.}= - ## Extended precision multiplication - result = cast[type result](x.asDoubleUint * y.asDoubleUint) - -func extPrecAddMul[T: uint8 or uint16 or uint32](result: var UintImpl[T], x, y: T) {.inline.}= - ## Extended precision fused in-place addition & multiplication - result += cast[type result](x.asDoubleUint * y.asDoubleUint) - -template extPrecMulImpl(result: var UintImpl[uint64], op: untyped, u, v: uint64) = - const - p = 64 div 2 - base: uint64 = 1'u64 shl p - - var - x0, x1, x2, x3: uint64 - - let - ul = lo(u) - uh = hi(u) - vl = lo(v) - vh = hi(v) - - x0 = ul * vl - x1 = ul * vh - x2 = uh * vl - x3 = uh * vh - - x1 += hi(x0) # This can't carry - x1 += x2 # but this can - if x1 < x2: # if carry, add it to x3 - x3 += base - - op(result.hi, x3 + hi(x1)) - op(result.lo, (x1 shl p) or lo(x0)) - -func extPrecMul*(result: var UintImpl[uint64], u, v: uint64) = - ## Extended precision multiplication - extPrecMulImpl(result, `=`, u, v) - -func extPrecAddMul(result: var UintImpl[uint64], u, v: uint64) = - ## Extended precision fused in-place addition & multiplication - extPrecMulImpl(result, `+=`, u, v) - -macro eqSym(x, y: untyped): untyped = - let eq = $x == $y # Unfortunately eqIdent compares to string. - result = newLit eq - -func extPrecAddMul[T](result: var UintImpl[UintImpl[T]], u, v: UintImpl[T]) -func extPrecMul*[T](result: var UintImpl[UintImpl[T]], u, v: UintImpl[T]) - # Forward declaration - -template extPrecMulImpl*[T](result: var UintImpl[UintImpl[T]], op: untyped, x, y: UintImpl[T]) = - # See details at - # https://en.wikipedia.org/wiki/Karatsuba_algorithm - # https://locklessinc.com/articles/256bit_arithmetic/ - # https://www.miracl.com/press/missing-a-trick-karatsuba-variations-michael-scott - # - # We use the naive school grade multiplication instead of Karatsuba I.e. 
- # z1 = x.hi * y.lo + x.lo * y.hi (Naive) = (x.lo - x.hi)(y.hi - y.lo) + z0 + z2 (Karatsuba) - # - # On modern architecture: - # - addition and multiplication have the same cost - # - Karatsuba would require to deal with potentially negative intermediate result - # and introduce branching - # - More total operations means more register moves - - var z1: type x - - # Low part and hi part - z0 & z2 - when eqSym(op, `+=`): - extPrecAddMul(result.lo, x.lo, y.lo) - extPrecAddMul(result.hi, x.hi, y.hi) - else: - extPrecMul(result.lo, x.lo, y.lo) - extPrecMul(result.hi, x.hi, y.hi) - - ## TODO - fuse those parts and reduce the number of carry checks - # Middle part - z1 - 1st mul - extPrecMul(z1, x.hi, y.lo) - result.lo.hi += z1.lo - if result.lo.hi < z1.lo: - inc result.hi - - result.hi.lo += z1.hi - if result.hi.lo < z1.hi: - inc result.hi.hi - - # Middle part - z1 - 2nd mul - extPrecMul(z1, x.lo, y.hi) - result.lo.hi += z1.lo - if result.lo.hi < z1.lo: - inc result.hi - - result.hi.lo += z1.hi - if result.hi.lo < z1.hi: - inc result.hi.hi - -func extPrecAddMul[T](result: var UintImpl[UintImpl[T]], u, v: UintImpl[T]) = - ## Extended precision fused in-place addition & multiplication - extPrecMulImpl(result, `+=`, u, v) - -func extPrecMul*[T](result: var UintImpl[UintImpl[T]], u, v: UintImpl[T]) = - ## Extended precision multiplication - extPrecMulImpl(result, `=`, u, v) - -func `*`*[T](x, y: UintImpl[T]): UintImpl[T] {.inline.}= - ## Multiplication for multi-precision unsigned uint - # - # For our representation, it is similar to school grade multiplication - # Consider hi and lo as if they were digits +import + ./datatypes, + ./primitives/extended_precision + +# Multiplication +# -------------------------------------------------------- +{.push raises: [], gcsafe.} + +func prod*[rLen, aLen, bLen: static int](r: var Limbs[rLen], a: Limbs[aLen], b: Limbs[bLen]) = + ## Multi-precision multiplication + ## r <- a*b + ## + ## `a`, `b`, `r` can have a different number of limbs + ## if `r`.limbs.len < a.limbs.len + b.limbs.len + ## The result will be truncated, i.e. it will be + ## a * b (mod (2^WordBitwidth)^r.limbs.len) + + # We use Product Scanning / Comba multiplication + var t, u, v = Word(0) + var z: typeof(r) # zero-init, ensure on stack and removes in-place problems + + staticFor i, 0, min(a.len+b.len, r.len): + const ib = min(b.len-1, i) + const ia = i - ib + staticFor j, 0, min(a.len - ia, ib+1): + mulAcc(t, u, v, a[ia+j], b[ib-j]) + + z[i] = v + v = u + u = t + t = 0 + + r = z + +func prod_high_words*[rLen, aLen, bLen: static int]( + r: var Limbs[rLen], + a: Limbs[aLen], b: Limbs[bLen], + lowestWordIndex: static int) = + ## Multi-precision multiplication keeping only high words + ## r <- a*b >> (2^WordBitWidth)^lowestWordIndex + ## + ## `a`, `b`, `r` can have a different number of limbs + ## if `r`.limbs.len < a.limbs.len + b.limbs.len - lowestWordIndex + ## The result will be truncated, i.e. 
it will be
+  ## a * b >> (2^WordBitWidth)^lowestWordIndex (mod (2^WordBitwidth)^r.limbs.len)
 #
-#   12
-#  X 15
-# ------
-#   10   lo*lo -> z0
-#   5    hi*lo -> z1
-#    2   lo*hi -> z1
-#   10   hi*hi -- z2
-# ------
-#   180
-#
-# If T is a type
-# For T * T --> T we don't need to compute z2 as it always overflow
-# For T * T --> 2T (uint64 * uint64 --> uint128) we use extra precision multiplication
-
-  extPrecMul(result, x.lo, y.lo)
-  result.hi += x.lo * y.hi + x.hi * y.lo
+  #
+  # This is useful for
+  # - Barrett reduction
+  # - Approximating multiplication by a fractional constant in the form f(a) = K/C * a
+  #   with K and C known at compile-time.
+  #   We can instead find a well chosen M = (2^WordBitWidth)^w, with M > C (i.e. M is a power of 2 bigger than C)
+  #   Precompute P = K*M/C at compile-time
+  #   and at runtime do P*a/M <=> P*a >> (WordBitWidth*w)
+  #   i.e. prod_high_words(result, P, a, w)
+
+  # We use Product Scanning / Comba multiplication
+  var t, u, v = Word(0) # Will raise warning on empty iterations
+  var z: Limbs[rLen]    # zero-init, ensure on stack and removes in-place problems
+
+  # The previous 2 columns can affect the lowest word due to carries
+  # but not the ones before (we accumulate in 3 words (t, u, v))
+  const w = lowestWordIndex - 2
+
+  staticFor i, max(0, w), min(a.len+b.len, r.len+lowestWordIndex):
+    const ib = min(b.len-1, i)
+    const ia = i - ib
+    staticFor j, 0, min(a.len - ia, ib+1):
+      mulAcc(t, u, v, a[ia+j], b[ib-j])
+
+    when i >= lowestWordIndex:
+      z[i-lowestWordIndex] = v
+    v = u
+    u = t
+    t = Word(0)
+
+  r = z
diff --git a/stint/private/uint_shift.nim b/stint/private/uint_shift.nim
new file mode 100644
index 0000000..093db7a
--- /dev/null
+++ b/stint/private/uint_shift.nim
@@ -0,0 +1,116 @@
+# Stint
+# Copyright 2018-Present Status Research & Development GmbH
+# Licensed under either of
+#
+#  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
+#  * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
+#
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  # Status lib
+  stew/bitops2,
+  # Internal
+  ./datatypes
+
+# Shifts
+# --------------------------------------------------------
+{.push raises: [], gcsafe.}
+
+func shrSmall*(r: var Limbs, a: Limbs, k: SomeInteger) =
+  ## Shift right by k.
+  ##
+  ## k MUST be less than the word bit width (32 or 64)
+  # Note: for speed, loading a[i] and a[i+1]
+  #       instead of a[i-1] and a[i]
+  #       is probably easier to parallelize for the compiler
+  #       (antidependence WAR vs loop-carried dependence RAW)
+  for i in 0 ..< a.len-1:
+    r[i] = (a[i] shr k) or (a[i+1] shl (WordBitWidth - k))
+  r[^1] = a[^1] shr k
+
+func shrLarge*(r: var Limbs, a: Limbs, w, shift: SomeInteger) =
+  ## Shift right by `w` words + `shift` bits
+  ## Assumes `r` is 0 initialized
+  if w >= Limbs.len:
+    return
+
+  for i in w ..< a.len-1:
+    r[i-w] = (a[i] shr shift) or (a[i+1] shl (WordBitWidth - shift))
+  r[^(1+w)] = a[^1] shr shift
+
+func shrWords*(r: var Limbs, a: Limbs, w: SomeInteger) =
+  ## Shift right by w words
+  for i in 0 ..< Limbs.len-w:
+    r[i] = a[i+w]
+  for i in Limbs.len-w ..< Limbs.len:
+    r[i] = 0
+
+func shlSmall*(r: var Limbs, a: Limbs, k: SomeInteger) =
+  ## Compute the `shift left` operation of a by k bits
+  ##
+  ## k MUST be less than the word bit width (32 or 64)
+  r[0] = a[0] shl k
+  for i in 1 ..< a.len:
+    r[i] = (a[i] shl k) or (a[i-1] shr (WordBitWidth - k))
+
+func shlLarge*(r: var Limbs, a: Limbs, w, shift: SomeInteger) =
+  ## Shift left by `w` words + `shift` bits
+  ## Assumes `r` is 0 initialized
+  if w >= Limbs.len:
+    return
+
+  r[w] = a[0] shl shift
+  for i in 1+w ..< r.len:
+    r[i] = (a[i-w] shl shift) or (a[i-w-1] shr (WordBitWidth - shift))
+
+func shlWords*(r: var Limbs, a: Limbs, w: SomeInteger) =
+  ## Shift left by w words
+  for i in 0 ..< w:
+    r[i] = 0
+  for i in 0 ..< Limbs.len-w:
+    r[i+w] = a[i]
+
+# Wrappers
+# --------------------------------------------------------
+
+func shiftRight*(r: var StUint, a: StUint, k: SomeInteger) =
+  ## Shift `a` right by k bits and store in `r`
+  if k == 0:
+    r = a
+    return
+
+  if k < WordBitWidth:
+    r.limbs.shrSmall(a.limbs, k)
+    return
+
+  # w = k div WordBitWidth, shift = k mod WordBitWidth
+  let w = k shr static(log2trunc(uint32(WordBitWidth)))
+  let shift = k and (WordBitWidth - 1)
+
+  if shift == 0:
+    r.limbs.shrWords(a.limbs, w)
+  else:
+    r.limbs.shrLarge(a.limbs, w, shift)
+
+func shiftLeft*(r: var StUint, a: StUint, k: SomeInteger) =
+  ## Shift `a` left by k bits and store in `r`
+  if k == 0:
+    r = a
+    return
+
+  if k < WordBitWidth:
+    r.limbs.shlSmall(a.limbs, k)
+    r.clearExtraBitsOverMSB()
+    return
+
+  # w = k div WordBitWidth, shift = k mod WordBitWidth
+  let w = k shr static(log2trunc(uint32(WordBitWidth)))
+  let shift = k and (WordBitWidth - 1)
+
+  if shift == 0:
+    r.limbs.shlWords(a.limbs, w)
+  else:
+    r.limbs.shlLarge(a.limbs, w, shift)
+
+  r.clearExtraBitsOverMSB()
diff --git a/stint/uintops.nim b/stint/uintops.nim
new file mode 100644
index 0000000..bedccce
--- /dev/null
+++ b/stint/uintops.nim
@@ -0,0 +1,243 @@
+# Stint
+# Copyright 2018-Present Status Research & Development GmbH
+# Licensed under either of
+#
+#  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
+#  * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
+#
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
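+
+# Non-normative usage sketch for the operators assembled below
+# (assumes the public `stint` module re-exports them together with
+# the `stuint` initializer used throughout the test suite):
+#
+#   let a = 1234.stuint(256)
+#   let b = 10.stuint(256)
+#   let (q, r) = divmod(a, b)  # q == 123, r == 4
+#   doAssert q * b + r == a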
+
+import
+  # Internal
+  ./private/datatypes,
+  ./private/uint_bitwise,
+  ./private/uint_shift,
+  ./private/uint_addsub,
+  ./private/uint_mul,
+  ./private/uint_div,
+  ./private/primitives/addcarry_subborrow
+
+export StUint
+
+# Initialization
+# --------------------------------------------------------
+{.push raises: [], inline, noinit, gcsafe.}
+
+func setZero*(a: var StUint) =
+  ## Set ``a`` to 0
+  for i in 0 ..< a.limbs.len:
+    a.limbs[i] = 0
+
+func setSmallInt(a: var StUint, k: Word) =
+  ## Set ``a`` to k
+  a.limbs[0] = k
+  for i in 1 ..< a.limbs.len:
+    a.limbs[i] = 0
+
+func setOne*(a: var StUint) =
+  setSmallInt(a, 1)
+
+func zero*[bits: static[int]](T: typedesc[StUint[bits]]): T {.inline.} =
+  ## Returns the zero of the input type
+  discard
+
+func one*[bits: static[int]](T: typedesc[StUint[bits]]): T {.inline.} =
+  ## Returns the one of the input type
+  result.setOne()
+
+func high*[bits](_: typedesc[StUint[bits]]): StUint[bits] {.inline.} =
+  for i in 0 ..< result.limbs.len:
+    result[i] = high(Word)
+
+func low*[bits](_: typedesc[StUint[bits]]): StUint[bits] {.inline.} =
+  discard
+
+{.pop.}
+# Comparisons
+# --------------------------------------------------------
+{.push raises: [], inline, noinit, gcsafe.}
+
+func isZero*(a: StUint): bool =
+  for i in 0 ..< a.limbs.len:
+    if a[i] != 0:
+      return false
+  return true
+
+func `==`*(a, b: StUint): bool {.inline.} =
+  ## Unsigned `equal` comparison
+  for i in 0 ..< a.limbs.len:
+    if a[i] != b[i]:
+      return false
+  return true
+
+func `<`*(a, b: StUint): bool {.inline.} =
+  ## Unsigned `less than` comparison
+  var diff: Word
+  var borrow: Borrow
+  for i in 0 ..< a.limbs.len:
+    subB(borrow, diff, a[i], b[i], borrow)
+  return bool(borrow)
+
+func `<=`*(a, b: StUint): bool {.inline.} =
+  ## Unsigned `less or equal` comparison
+  not(b < a)
+
+func isOdd*(a: StUint): bool {.inline.} =
+  ## Returns true if input is odd
+  ## false otherwise
+  bool(a[0] and 1)
+
+func isEven*(a: StUint): bool {.inline.} =
+  ## Returns true if input is even
+  ## false otherwise
+  not a.isOdd()
+
+{.pop.}
+# Bitwise operations
+# --------------------------------------------------------
+{.push raises: [], inline, noinit, gcsafe.}
+
+func `not`*(a: StUint): StUint =
+  ## Bitwise complement of unsigned integer a
+  ## i.e.
flips all bits of the input
+  result.bitnot(a)
+
+func `or`*(a, b: StUint): StUint =
+  ## `Bitwise or` of numbers a and b
+  result.bitor(a, b)
+
+func `and`*(a, b: StUint): StUint =
+  ## `Bitwise and` of numbers a and b
+  result.bitand(a, b)
+
+func `xor`*(a, b: StUint): StUint =
+  ## `Bitwise xor` of numbers a and b
+  result.bitxor(a, b)
+
+{.pop.} # End noInit
+
+export
+  countOnes,
+  parity,
+  leadingZeros,
+  trailingZeros,
+  firstOne
+
+{.push raises: [], inline, gcsafe.}
+
+func `shr`*(a: StUint, k: SomeInteger): StUint =
+  ## Shift right by k bits
+  result.shiftRight(a, k)
+
+func `shl`*(a: StUint, k: SomeInteger): StUint =
+  ## Shift left by k bits
+  result.shiftLeft(a, k)
+
+{.pop.}
+
+# Addsub
+# --------------------------------------------------------
+{.push raises: [], inline, noinit, gcsafe.}
+
+func `+`*(a, b: StUint): StUint =
+  ## Addition for multi-precision unsigned int
+  result.sum(a, b)
+
+export `+=`
+
+func `-`*(a, b: StUint): StUint =
+  ## Subtraction for multi-precision unsigned int
+  result.diff(a, b)
+
+export `-=`
+
+export inc
+
+func `+`*(a: StUint, b: SomeUnsignedInt): StUint =
+  ## Addition for multi-precision unsigned int
+  ## with an unsigned integer
+  result.sum(a, Word(b))
+
+export `+=`
+
+{.pop.}
+
+# Multiplication
+# --------------------------------------------------------
+# Multiplication is implemented in a separate file at the limb-level
+# - It's too big to be inlined (especially with unrolled loops)
+# - It's implemented at the limb-level so that
+#   in the future Stuint[254] and Stuint[256] share a common codepath
+
+{.push raises: [], inline, noinit, gcsafe.}
+
+func `*`*(a, b: StUint): StUint =
+  ## Integer multiplication
+  result.limbs.prod(a.limbs, b.limbs)
+  result.clearExtraBitsOverMSB()
+
+{.pop.}
+
+# Exponentiation
+# --------------------------------------------------------
+
+{.push raises: [], noinit, gcsafe.}
+
+func pow*(a: StUint, e: Natural): StUint =
+  ## Compute ``a`` to the power of ``e``,
+  ## ``e`` must be non-negative
+
+  # Implementation uses exponentiation by squaring
+  # See Nim math module: https://github.com/nim-lang/Nim/blob/4ed24aa3eb78ba4ff55aac3008ec3c2427776e50/lib/pure/math.nim#L429
+  # And Eli Bendersky's blog: https://eli.thegreenplace.net/2009/03/21/efficient-integer-exponentiation-algorithms
+
+  var (a, e) = (a, e)
+  result.setOne()
+
+  while true:
+    if bool(e and 1): # if e is odd
+      result = result * a
+    e = e shr 1
+    if e == 0:
+      break
+    a = a * a
+
+func pow*[aBits, eBits](a: StUint[aBits], e: StUint[eBits]): StUint[aBits] =
+  ## Compute ``a`` to the power of ``e``,
+  ## ``e`` must be non-negative
+  # Implementation uses exponentiation by squaring
+  # See Nim math module: https://github.com/nim-lang/Nim/blob/4ed24aa3eb78ba4ff55aac3008ec3c2427776e50/lib/pure/math.nim#L429
+  # And Eli Bendersky's blog: https://eli.thegreenplace.net/2009/03/21/efficient-integer-exponentiation-algorithms
+
+  var (a, e) = (a, e)
+  result.setOne()
+
+  while true:
+    if e.isOdd:
+      result = result * a
+    e = e shr 1
+    if e.isZero:
+      break
+    a = a * a
+
+{.pop.}
+
+# Division & Modulo
+# --------------------------------------------------------
+{.push raises: [], inline, noinit, gcsafe.}
+
+func `div`*(x, y: StUint): StUint =
+  ## Division operation for multi-precision unsigned uint
+  var tmp{.noinit.}: StUint
+  divRem(result.limbs, tmp.limbs, x.limbs, y.limbs)
+
+func `mod`*(x, y: StUint): StUint =
+  ## Remainder operation for multi-precision unsigned uint
+  var tmp{.noinit.}: StUint
+  divRem(tmp.limbs, result.limbs, x.limbs,
y.limbs) + +func divmod*(x, y: StUint): tuple[quot, rem: StUint] = + ## Division and remainder operations for multi-precision unsigned uint + divRem(result.quot.limbs, result.rem.limbs, x.limbs, y.limbs) + +{.pop.} \ No newline at end of file diff --git a/tests/all_tests.nim b/tests/all_tests.nim index 45b138d..b6ba60c 100644 --- a/tests/all_tests.nim +++ b/tests/all_tests.nim @@ -7,16 +7,19 @@ # # at your option. This file may not be copied, modified, or distributed except according to those terms. -import test_uint_bitops2, - test_uint_endianness, - test_uint_comparison, - test_uint_bitwise, - test_uint_addsub, - test_uint_muldiv, - test_uint_exp, - test_uint_modular_arithmetic, - test_uint_endians2 +import + test_uint_addsub, + test_uint_bitops2, + test_uint_bitwise, + test_uint_comparison, + #test_uint_divmod, + test_uint_endianness, + test_uint_endians2, + test_uint_exp, + #test_uint_modular_arithmetic, + test_uint_mul +#[ import test_int_endianness, test_int_comparison, test_int_addsub, @@ -26,3 +29,4 @@ import test_int_endianness, import test_io, test_conversion +]# diff --git a/tests/property_based.nim b/tests/property_based.nim deleted file mode 100644 index 640c3f3..0000000 --- a/tests/property_based.nim +++ /dev/null @@ -1,238 +0,0 @@ -# Stint -# Copyright 2018 Status Research & Development GmbH -# Licensed under either of -# -# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) -# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) -# -# at your option. This file may not be copied, modified, or distributed except according to those terms. - -import ../stint, unittest, quicktest, math - -const itercount = 1000 - -suite "Property-based testing (testing with random inputs) - uint64 on 64-bit / uint32 on 32-bit": - - when defined(release): - echo "Testing in release mode with " & $itercount & " random tests for each proc." - else: - echo "Testing in debug mode " & $itercount & " random tests for each proc. 
(StUint[64] = 2x uint32)" - when defined(stint_test): - echo "(StUint[64] = 2x uint32)" - else: - echo "(StUint[64] = uint64)" - - let hi = 1'u shl (sizeof(uint)*7) - - quicktest "`or`", itercount do(x: uint(min=0, max=hi), y: uint(min=0, max=hi)): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - ty = cast[StUint[64]](y) - tz = tx or ty - else: - let - tx = cast[StUint[32]](x) - ty = cast[StUint[32]](y) - tz = tx or ty - - - check(cast[uint](tz) == (x or y)) - - - quicktest "`and`", itercount do(x: uint(min=0, max=hi), y: uint(min=0, max=hi)): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - ty = cast[StUint[64]](y) - tz = tx and ty - else: - let - tx = cast[StUint[32]](x) - ty = cast[StUint[32]](y) - tz = tx and ty - - check(cast[uint](tz) == (x and y)) - - quicktest "`xor`", itercount do(x: uint(min=0, max=hi), y: uint(min=0, max=hi)): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - ty = cast[StUint[64]](y) - tz = tx xor ty - else: - let - tx = cast[StUint[32]](x) - ty = cast[StUint[32]](y) - tz = tx xor ty - - check(cast[uint](tz) == (x xor y)) - - quicktest "`not`", itercount do(x: uint(min=0, max=hi)): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - tz = not tx - else: - let - tx = cast[StUint[32]](x) - tz = not tx - - check(cast[uint](tz) == (not x)) - - quicktest "`<`", itercount do(x: uint(min=0, max=hi), y: uint(min=0, max=hi)): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - ty = cast[StUint[64]](y) - tz = tx < ty - else: - let - tx = cast[StUint[32]](x) - ty = cast[StUint[32]](y) - tz = tx < ty - - check(tz == (x < y)) - - - quicktest "`<=`", itercount do(x: uint(min=0, max=hi), y: uint(min=0, max=hi)): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - ty = cast[StUint[64]](y) - tz = tx <= ty - else: - let - tx = cast[StUint[32]](x) - ty = cast[StUint[32]](y) - tz = tx <= ty - - check(tz == (x <= y)) - - quicktest "`+`", itercount do(x: uint(min=0, max=hi), y: uint(min=0, max=hi)): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - ty = cast[StUint[64]](y) - tz = tx + ty - else: - let - tx = cast[StUint[32]](x) - ty = cast[StUint[32]](y) - tz = tx + ty - - check(cast[uint](tz) == x+y) - - - quicktest "`-`", itercount do(x: uint(min=0, max=hi), y: uint(min=0, max=hi)): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - ty = cast[StUint[64]](y) - tz = tx - ty - else: - let - tx = cast[StUint[32]](x) - ty = cast[StUint[32]](y) - tz = tx - ty - - check(cast[uint](tz) == x-y) - - quicktest "`*`", itercount do(x: uint(min=0, max=hi), y: uint(min=0, max=hi)): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - ty = cast[StUint[64]](y) - tz = tx * ty - else: - let - tx = cast[StUint[32]](x) - ty = cast[StUint[32]](y) - tz = tx * ty - - check(cast[uint](tz) == x*y) - - quicktest "`shl`", itercount do(x: uint(min=0, max=hi), y: int(min = 0, max=(sizeof(int)*8-1))): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - tz = tx shl y - else: - let - tx = cast[StUint[32]](x) - tz = tx shl y - - check(cast[uint](tz) == x shl y) - - quicktest "`shr`", itercount do(x: uint(min=0, max=hi), y: int(min = 0, max=(sizeof(int)*8-1))): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - tz = tx shr y - else: - let - tx = cast[StUint[32]](x) - tz = tx shr y - - check(cast[uint](tz) == x shr y) - - quicktest "`mod`", itercount do(x: uint(min=0, max=hi), y: uint(min = 1, max = hi)): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - ty = cast[StUint[64]](y) - tz = 
tx mod ty - else: - let - tx = cast[StUint[32]](x) - ty = cast[StUint[32]](y) - tz = tx mod ty - - check(cast[uint](tz) == x mod y) - - quicktest "`div`", itercount do(x: uint(min=0, max=hi), y: uint(min = 1, max = hi)): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - ty = cast[StUint[64]](y) - tz = tx div ty - else: - let - tx = cast[StUint[32]](x) - ty = cast[StUint[32]](y) - tz = tx div ty - - check(cast[uint](tz) == x div y) - - quicktest "pow", itercount do(x: uint(min=0, max=hi), y: int(min = 0, max = high(int))): - - when sizeof(int) == 8: - let - tx = cast[StUint[64]](x) - tz = tx.pow(y) - - ty = cast[StUint[64]](y) - tz2 = tx.pow(ty) - else: - let - tx = cast[StUint[32]](x) - tz = tx.pow(y) - - ty = cast[StUint[32]](y) - tz2 = tx.pow(ty) - - check(cast[uint](tz) == x ^ y) - check(cast[uint](tz2) == x ^ y) diff --git a/tests/t_randomized_divmod.nim b/tests/t_randomized_divmod.nim new file mode 100644 index 0000000..c0e2e71 --- /dev/null +++ b/tests/t_randomized_divmod.nim @@ -0,0 +1,46 @@ +# Stint +# Copyright 2022 Status Research & Development GmbH +# Licensed under either of +# +# * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +# * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) +# +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Standard library + std/[unittest, times], + # Internal + ../stint, + # Test utilities + ../helpers/prng_unsafe + +const Iters = 50000 + +var rng: RngState +let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32 +rng.seed(seed) +echo "\n------------------------------------------------------\n" +echo "t_randomized_divmod xoshiro512** seed: ", seed + +proc test_divmod(bits: static int, iters: int, gen: RandomGen) = + for _ in 0 ..< iters: + let a = rng.random_elem(Stuint[bits], gen) + let b = rng.random_elem(Stuint[bits], gen) + + try: + let (q, r) = divmod(a, b) + doAssert a == q*b + r + except DivByZeroDefect: + doAssert b.isZero() + +template test(bits: static int) = + test "(q, r) = divmod(a, b) <=> a = q*b + r (" & $bits & " bits)": + test_divmod(bits, Iters, Uniform) + test_divmod(bits, Iters, HighHammingWeight) + test_divmod(bits, Iters, Long01Sequence) + +suite "Randomized division and modulo checks": + test(128) + test(256) + test(512) \ No newline at end of file diff --git a/tests/t_randomized_vs_gmp.nim b/tests/t_randomized_vs_gmp.nim new file mode 100644 index 0000000..7638031 --- /dev/null +++ b/tests/t_randomized_vs_gmp.nim @@ -0,0 +1,197 @@ +# Stint +# Copyright (c) 2018-2022 Status Research & Development GmbH +# +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
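+
+# Differential testing strategy: generate random Stuint operands,
+# mirror each computation in GMP reduced modulo 2^bits, and compare
+# the canonical big-endian byte exports. GMP left-aligns the exported
+# bytes while Stint right-aligns them, hence the offset comparisons
+# in the doAsserts below.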
+ +import + # Standard library + std/[unittest, times, strutils], + # Third-party + gmp, stew/byteutils, + # Internal + ../stint, + # Test utilities + ../helpers/[prng_unsafe, staticfor] + +const + Iters = 1000 + Bitwidths = [128, 256, 512, 1024, 2048] + +const # https://gmplib.org/manual/Integer-Import-and-Export.html + GMP_WordLittleEndian {.used.} = -1'i32 + GMP_WordNativeEndian {.used.} = 0'i32 + GMP_WordBigEndian {.used.} = 1'i32 + + GMP_MostSignificantWordFirst = 1'i32 + GMP_LeastSignificantWordFirst {.used.} = -1'i32 + +var rng: RngState +let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32 +rng.seed(seed) +echo "\n------------------------------------------------------\n" +echo "t_randomized_vs_gmp xoshiro512** seed: ", seed + +proc rawUint(dst: var openArray[byte], src: mpz_t): csize = + ## Converts a GMP bigint to a canonical integer as a BigEndian array of byte + ## Returns the number of words actually written + discard mpz_export(dst[0].addr, result.addr, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, src) + +proc fromStuint[bits: static int](dst: var mpz_t, src: Stuint[bits]) = + let t = src.toBytes() + mpz_import(dst, t.len, GMP_MostSignificantWordFirst, 1, GMP_WordNativeEndian, 0, t[0].addr) + + # Sanity check + var t2: typeof(t) + let wordsWritten = t2.rawUint(dst) + # Note: in bigEndian, GMP aligns left while Stint aligns right + doAssert t2.toOpenArray(0, wordsWritten-1) == t.toOpenArray(t.len-wordsWritten, t.len-1) + +proc test_add(bits: static int, iters: int, gen: RandomGen) = + + const N = (bits + 7) div 8 + + var x, y, z, m: mpz_t + mpz_init(x) + mpz_init(y) + mpz_init(z) + mpz_init(m) + mpz_ui_pow_ui(m, 2, bits) # 2^bits + + for _ in 0 ..< iters: + let a = rng.random_elem(Stuint[bits], gen) + let b = rng.random_elem(Stuint[bits], gen) + + x.fromStuint(a) + y.fromStuint(b) + + let c = a + b + mpz_add(z, x, y) + mpz_mod(z, z, m) + + let cBytes = c.toBytes() + + var zBytes: array[N, byte] + let wordsWritten = zBytes.rawUint(z) + + # Note: in bigEndian, GMP aligns left while Stint aligns right + doAssert zBytes.toOpenArray(0, wordsWritten-1) == cBytes.toOpenArray(N-wordsWritten, N-1), block: + # Reexport as bigEndian for debugging + var xBuf, yBuf: array[N, byte] + discard xBuf.rawUint(x) + discard yBuf.rawUint(y) + "\nAddition with operands\n" & + " x (" & align($bits, 4) & "-bit): 0x" & xBuf.toHex & "\n" & + " y (" & align($bits, 4) & "-bit): 0x" & yBuf.toHex & "\n" & + "failed:" & "\n" & + " GMP: 0x" & zBytes.toHex() & "\n" & + " Stint: 0x" & cBytes.toHex() & "\n" & + "(Note that GMP aligns bytes left while Stint aligns bytes right)" + +template testAddition(bits: static int) = + test "Addition vs GMP (" & $bits & " bits)": + test_add(bits, Iters, Uniform) + test_add(bits, Iters, HighHammingWeight) + test_add(bits, Iters, Long01Sequence) + +proc test_sub(bits: static int, iters: int, gen: RandomGen) = + + const N = (bits + 7) div 8 + + var x, y, z, m: mpz_t + mpz_init(x) + mpz_init(y) + mpz_init(z) + mpz_init(m) + mpz_ui_pow_ui(m, 2, bits) # 2^bits + + for _ in 0 ..< iters: + let a = rng.random_elem(Stuint[bits], gen) + let b = rng.random_elem(Stuint[bits], gen) + + x.fromStuint(a) + y.fromStuint(b) + + let c = a - b + mpz_sub(z, x, y) + mpz_mod(z, z, m) + + let cBytes = c.toBytes() + + var zBytes: array[N, byte] + let wordsWritten = zBytes.rawUint(z) + + # Note: in bigEndian, GMP aligns left while Stint aligns right + doAssert zBytes.toOpenArray(0, wordsWritten-1) == cBytes.toOpenArray(N-wordsWritten, N-1), block: + # Reexport 
as bigEndian for debugging
+      var xBuf, yBuf: array[N, byte]
+      discard xBuf.rawUint(x)
+      discard yBuf.rawUint(y)
+      "\nSubtraction with operands\n" &
+      " x (" & align($bits, 4) & "-bit): 0x" & xBuf.toHex & "\n" &
+      " y (" & align($bits, 4) & "-bit): 0x" & yBuf.toHex & "\n" &
+      "failed:" & "\n" &
+      " GMP:   0x" & zBytes.toHex() & "\n" &
+      " Stint: 0x" & cBytes.toHex() & "\n" &
+      "(Note that GMP aligns bytes left while Stint aligns bytes right)"
+
+template testSubtraction(bits: static int) =
+  test "Subtraction vs GMP (" & $bits & " bits)":
+    test_sub(bits, Iters, Uniform)
+    test_sub(bits, Iters, HighHammingWeight)
+    test_sub(bits, Iters, Long01Sequence)
+
+proc test_mul(bits: static int, iters: int, gen: RandomGen) =
+
+  const N = (bits + 7) div 8
+
+  var x, y, z, m: mpz_t
+  mpz_init(x)
+  mpz_init(y)
+  mpz_init(z)
+  mpz_init(m)
+  mpz_ui_pow_ui(m, 2, bits) # 2^bits
+
+  for _ in 0 ..< iters:
+    let a = rng.random_elem(Stuint[bits], gen)
+    let b = rng.random_elem(Stuint[bits], gen)
+
+    x.fromStuint(a)
+    y.fromStuint(b)
+
+    let c = a * b
+    mpz_mul(z, x, y)
+    mpz_mod(z, z, m)
+
+    let cBytes = c.toBytes()
+
+    var zBytes: array[N, byte]
+    let wordsWritten = zBytes.rawUint(z)
+
+    # Note: in bigEndian, GMP aligns left while Stint aligns right
+    doAssert zBytes.toOpenArray(0, wordsWritten-1) == cBytes.toOpenArray(N-wordsWritten, N-1), block:
+      # Reexport as bigEndian for debugging
+      var xBuf, yBuf: array[N, byte]
+      discard xBuf.rawUint(x)
+      discard yBuf.rawUint(y)
+      "\nMultiplication with operands\n" &
+      " x (" & align($bits, 4) & "-bit): 0x" & xBuf.toHex & "\n" &
+      " y (" & align($bits, 4) & "-bit): 0x" & yBuf.toHex & "\n" &
+      "failed:" & "\n" &
+      " GMP:   0x" & zBytes.toHex() & "\n" &
+      " Stint: 0x" & cBytes.toHex() & "\n" &
+      "(Note that GMP aligns bytes left while Stint aligns bytes right)"
+
+template testMultiplication(bits: static int) =
+  test "Multiplication vs GMP (" & $bits & " bits)":
+    test_mul(bits, Iters, Uniform)
+    test_mul(bits, Iters, HighHammingWeight)
+    test_mul(bits, Iters, Long01Sequence)
+
+suite "Randomized arithmetic tests vs GMP":
+  staticFor i, 0, Bitwidths.len:
+    testAddition(Bitwidths[i])
+    testSubtraction(Bitwidths[i])
+    testMultiplication(Bitwidths[i])
\ No newline at end of file
diff --git a/tests/property_based_uint256.nim b/tests/test_uint256_ttmath.nim
similarity index 99%
rename from tests/property_based_uint256.nim
rename to tests/test_uint256_ttmath.nim
index d722a06..8203737 100644
--- a/tests/property_based_uint256.nim
+++ b/tests/test_uint256_ttmath.nim
@@ -20,10 +20,6 @@ suite "Property-based testing (testing with random inputs) of Uint256":
     echo "Testing in release mode with " & $itercount & " random tests for each proc."
   else:
     echo "Testing in debug mode " & $itercount & " random tests for each proc. 
(StUint[64] = 2x uint32)" - when defined(stint_test): - echo "(StUint[64] = 2x uint32)" - else: - echo "(StUint[64] = uint64)" let hi = 1'u shl (sizeof(uint64)*7) diff --git a/tests/test_uint_addsub.nim b/tests/test_uint_addsub.nim index b78eafc..c5c09bf 100644 --- a/tests/test_uint_addsub.nim +++ b/tests/test_uint_addsub.nim @@ -35,7 +35,7 @@ template chkInplaceSubstraction(chk, a, b, c, bits: untyped) = template testAddSub(chk, tst: untyped) = tst "addition": - chkAddition(chk, 0'u8, 0'u8, 0'u8, 8) + #[chkAddition(chk, 0'u8, 0'u8, 0'u8, 8) chkAddition(chk, high(uint8) - 17'u8, 17'u8, high(uint8), 8) chkAddition(chk, low(uint8), 17'u8, low(uint8) + 17'u8, 8) @@ -61,7 +61,7 @@ template testAddSub(chk, tst: untyped) = chkAddition(chk, high(uint32) - 17'u32, 17'u32, high(uint32), 64) chkAddition(chk, low(uint32), 17'u32, low(uint32) + 17'u32, 64) chkAddition(chk, high(uint64) - 17'u64, 17'u64, high(uint64), 64) - chkAddition(chk, low(uint64), 17'u64, low(uint64) + 17'u64, 64) + chkAddition(chk, low(uint64), 17'u64, low(uint64) + 17'u64, 64)]# chkAddition(chk, 0'u8, 0'u8, 0'u8, 128) chkAddition(chk, high(uint8) - 17'u8, 17'u8, high(uint8), 128) @@ -74,7 +74,7 @@ template testAddSub(chk, tst: untyped) = chkAddition(chk, low(uint64), 17'u64, low(uint64) + 17'u64, 128) tst "inplace addition": - chkInplaceAddition(chk, 0'u8, 0'u8, 0'u8, 8) + #[chkInplaceAddition(chk, 0'u8, 0'u8, 0'u8, 8) chkInplaceAddition(chk, high(uint8) - 17'u8, 17'u8, high(uint8), 8) chkInplaceAddition(chk, low(uint8) + 17'u8, 17'u8, low(uint8) + 34'u8, 8) @@ -100,7 +100,7 @@ template testAddSub(chk, tst: untyped) = chkInplaceAddition(chk, high(uint32) - 17'u32, 17'u32, high(uint32), 64) chkInplaceAddition(chk, low(uint32) + 17'u32, 17'u32, low(uint32) + 34'u32, 64) chkInplaceAddition(chk, high(uint64) - 17'u64, 17'u64, high(uint64), 64) - chkInplaceAddition(chk, low(uint64) + 17'u64, 17'u64, low(uint64) + 34'u64, 64) + chkInplaceAddition(chk, low(uint64) + 17'u64, 17'u64, low(uint64) + 34'u64, 64)]# chkInplaceAddition(chk, 0'u8, 0'u8, 0'u8, 128) chkInplaceAddition(chk, high(uint8) - 17'u8, 17'u8, high(uint8), 128) @@ -113,7 +113,7 @@ template testAddSub(chk, tst: untyped) = chkInplaceAddition(chk, low(uint64) + 17'u64, 17'u64, low(uint64) + 34'u64, 128) tst "substraction": - chkSubstraction(chk, 0'u8, 0'u8, 0'u8, 8) + #[chkSubstraction(chk, 0'u8, 0'u8, 0'u8, 8) chkSubstraction(chk, high(uint8) - 17'u8, 17'u8, high(uint8) - 34'u8, 8) chkSubstraction(chk, low(uint8) + 17'u8, 17'u8, low(uint8), 8) @@ -139,7 +139,7 @@ template testAddSub(chk, tst: untyped) = chkSubstraction(chk, high(uint32) - 17'u32, 17'u32, high(uint32) - 34'u32, 64) chkSubstraction(chk, low(uint32) + 17'u32, 17'u32, low(uint32), 64) chkSubstraction(chk, high(uint64) - 17'u64, 17'u64, high(uint64) - 34'u64, 64) - chkSubstraction(chk, low(uint64) + 17'u64, 17'u64, low(uint64), 64) + chkSubstraction(chk, low(uint64) + 17'u64, 17'u64, low(uint64), 64)]# chkSubstraction(chk, 0'u8, 0'u8, 0'u8, 128) chkSubstraction(chk, high(uint8) - 17'u8, 17'u8, high(uint8) - 34'u8, 128) @@ -152,7 +152,7 @@ template testAddSub(chk, tst: untyped) = chkSubstraction(chk, high(uint64), high(uint64), 0'u64, 128) tst "inplace substraction": - chkInplaceSubstraction(chk, 0'u8, 0'u8, 0'u8, 8) + #[chkInplaceSubstraction(chk, 0'u8, 0'u8, 0'u8, 8) chkInplaceSubstraction(chk, high(uint8) - 17'u8, 17'u8, high(uint8) - 34'u8, 8) chkInplaceSubstraction(chk, low(uint8) + 17'u8, 17'u8, low(uint8), 8) @@ -178,7 +178,7 @@ template testAddSub(chk, tst: untyped) = chkInplaceSubstraction(chk, 
high(uint32) - 17'u32, 17'u32, high(uint32) - 34'u32, 64) chkInplaceSubstraction(chk, low(uint32) + 17'u32, 17'u32, low(uint32), 64) chkInplaceSubstraction(chk, high(uint64) - 17'u64, 17'u64, high(uint64) - 34'u64, 64) - chkInplaceSubstraction(chk, low(uint64) + 17'u64, 17'u64, low(uint64), 64) + chkInplaceSubstraction(chk, low(uint64) + 17'u64, 17'u64, low(uint64), 64)]# chkInplaceSubstraction(chk, 0'u8, 0'u8, 0'u8, 128) chkInplaceSubstraction(chk, high(uint8) - 17'u8, 17'u8, high(uint8) - 34'u8, 128) @@ -196,6 +196,7 @@ static: suite "Wider unsigned int addsub coverage": testAddSub(check, test) +#[ suite "Testing unsigned int addition implementation": test "In-place addition gives expected result": @@ -261,3 +262,4 @@ suite "Testing unsigned int substraction implementation": let b = 101'u16.stuint(16) check: cast[uint16](a-b) == high(uint16) +]# diff --git a/tests/test_uint_bitwise.nim b/tests/test_uint_bitwise.nim index 0f9d2af..8e72fec 100644 --- a/tests/test_uint_bitwise.nim +++ b/tests/test_uint_bitwise.nim @@ -47,7 +47,7 @@ template testBitwise(chk, tst: untyped) = #chkShr(chk, "F0000000000000000000000000000000", 128, "00", 128) tst "operator `not`": - chkNot(chk, 0'u8, not 0'u8, 8) + #[chkNot(chk, 0'u8, not 0'u8, 8) chkNot(chk, high(uint8), not high(uint8), 8) chkNot(chk, "F0", "0F", 8) chkNot(chk, "0F", "F0", 8) @@ -83,7 +83,7 @@ template testBitwise(chk, tst: untyped) = chkNot(chk, high(uint8), not uint64(high(uint8)), 64) chkNot(chk, high(uint16), not uint64(high(uint16)), 64) chkNot(chk, high(uint32), not uint64(high(uint32)), 64) - chkNot(chk, high(uint64), not high(uint64), 64) + chkNot(chk, high(uint64), not high(uint64), 64)]# chkNot(chk, "0", "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", 128) chkNot(chk, "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", "0", 128) @@ -91,7 +91,7 @@ template testBitwise(chk, tst: untyped) = chkNot(chk, "FFFFFFFFFFFF00000000000000000000", "000000000000FFFFFFFFFFFFFFFFFFFF", 128) tst "operator `or`": - chkOr(chk, "00", "FF", "FF", 8) + #[chkOr(chk, "00", "FF", "FF", 8) chkOr(chk, "FF", "00", "FF", 8) chkOr(chk, "F0", "0F", "FF", 8) chkOr(chk, "00", "00", "00", 8) @@ -114,7 +114,7 @@ template testBitwise(chk, tst: untyped) = chkOr(chk, "F0", "0F", "00000000000000FF", 64) chkOr(chk, "00", "00", "0000000000000000", 64) chkOr(chk, "FF00", "0F00", "000000000000FF00", 64) - chkOr(chk, "00FF00FF", "000F000F", "0000000000FF00FF", 64) + chkOr(chk, "00FF00FF", "000F000F", "0000000000FF00FF", 64)]# chkOr(chk, "00", "FF", "000000000000000000000000000000FF", 128) chkOr(chk, "FF", "00", "000000000000000000000000000000FF", 128) @@ -125,7 +125,7 @@ template testBitwise(chk, tst: untyped) = chkOr(chk, "00000000000000000000000000FF00FF", "FF0F0000000000000000000000FF00FF", "FF0F0000000000000000000000FF00FF", 128) tst "operator `and`": - chkAnd(chk, "00", "FF", "00", 8) + #[chkAnd(chk, "00", "FF", "00", 8) chkAnd(chk, "FF", "00", "00", 8) chkAnd(chk, "F0", "0F", "00", 8) chkAnd(chk, "00", "00", "00", 8) @@ -150,7 +150,7 @@ template testBitwise(chk, tst: untyped) = chkAnd(chk, "F0", "0F", "0000000000000000", 64) chkAnd(chk, "00", "00", "0000000000000000", 64) chkAnd(chk, "FF00", "0F00", "0000000000000F00", 64) - chkAnd(chk, "00FF00FF", "000F000F", "00000000000F000F", 64) + chkAnd(chk, "00FF00FF", "000F000F", "00000000000F000F", 64)]# chkAnd(chk, "00", "FF", "00000000000000000000000000000000", 128) chkAnd(chk, "FF", "00", "00000000000000000000000000000000", 128) @@ -161,7 +161,7 @@ template testBitwise(chk, tst: untyped) = chkAnd(chk, "F0000000000000000000000000FF00FF", 
"FF0F0000000000000000000000FF00FF", "F0000000000000000000000000FF00FF", 128) tst "operator `xor`": - chkXor(chk, "00", "FF", "FF", 8) + #[chkXor(chk, "00", "FF", "FF", 8) chkXor(chk, "FF", "00", "FF", 8) chkXor(chk, "F0", "0F", "FF", 8) chkXor(chk, "00", "00", "00", 8) @@ -186,7 +186,7 @@ template testBitwise(chk, tst: untyped) = chkXor(chk, "F0", "0F", "00000000000000FF", 64) chkXor(chk, "00", "00", "0000000000000000", 64) chkXor(chk, "FF00", "0F00", "000000000000F000", 64) - chkXor(chk, "00FF00FF", "000F000F", "0000000000F000F0", 64) + chkXor(chk, "00FF00FF", "000F000F", "0000000000F000F0", 64)]# chkXor(chk, "00", "FF", "000000000000000000000000000000FF", 128) chkXor(chk, "FF", "00", "000000000000000000000000000000FF", 128) @@ -197,7 +197,7 @@ template testBitwise(chk, tst: untyped) = chkXor(chk, "F0000000000000000000000000FF00FF", "FF0F0000000000000000000000FF00FF", "0F0F0000000000000000000000000000", 128) tst "operator `shl`": - chkShl(chk, "0F", 4, "F0", 8) + #[chkShl(chk, "0F", 4, "F0", 8) chkShl(chk, "F0", 4, "00", 8) chkShl(chk, "F0", 3, "80", 8) chkShl(chk, "0F", 7, "80", 8) @@ -226,7 +226,7 @@ template testBitwise(chk, tst: untyped) = chkShl(chk, "0F", 5, "1E0", 64) chkShl(chk, "0F", 9, "1E00", 64) chkShl(chk, "0F", 17, "1E0000", 64) - chkShl(chk, "0F", 33, "1E00000000", 64) + chkShl(chk, "0F", 33, "1E00000000", 64)]# chkShl(chk, "0F", 4, "F0", 128) chkShl(chk, "F0", 4, "F00", 128) @@ -257,7 +257,7 @@ template testBitwise(chk, tst: untyped) = chkShl(chk, "0F", 255, "8000000000000000000000000000000000000000000000000000000000000000", 256) tst "operator `shr`": - chkShr(chk, "0F", 4, "00", 8) + #[chkShr(chk, "0F", 4, "00", 8) chkShr(chk, "F0", 4, "0F", 8) chkShr(chk, "F0", 3, "1E", 8) chkShr(chk, "F0", 7, "01", 8) @@ -278,7 +278,7 @@ template testBitwise(chk, tst: untyped) = chkShr(chk, "F0", 3, "1E", 64) chkShr(chk, "F000", 3, "1E00", 64) chkShr(chk, "F0000000", 3, "1E000000", 64) - chkShr(chk, "F000000000000000", 63, "0000000000000001", 64) + chkShr(chk, "F000000000000000", 63, "0000000000000001", 64)]# chkShr(chk, "0F", 4, "00", 128) chkShr(chk, "F0", 4, "0F", 128) @@ -311,6 +311,7 @@ static: suite "Wider unsigned int bitwise coverage": testBitwise(check, test) +#[ suite "Testing unsigned int bitwise operations": let a = 100'i16.stuint(16) @@ -337,12 +338,6 @@ suite "Testing unsigned int bitwise operations": check: cast[uint16](b) == z # Sanity check check: cast[uint16](b shl 8) == z shl 8 - block: # Testing shl for nested UintImpl - let p2_64 = UintImpl[uint64](hi:1, lo:0) - let p = 1.stuint(128) shl 64 - - check: p == cast[StUint[128]](p2_64) - test "Shift right - by less than half the size of the integer": check: cast[uint16](b) == z # Sanity check check: cast[uint16](b shr 2) == z shr 2 @@ -354,3 +349,4 @@ suite "Testing unsigned int bitwise operations": test "Shift right - by half the size of the integer": check: cast[uint16](b) == z # Sanity check check: cast[uint16](b shr 8) == z shr 8 +]# \ No newline at end of file diff --git a/tests/test_uint_comparison.nim b/tests/test_uint_comparison.nim index 7599c6f..39d850e 100644 --- a/tests/test_uint_comparison.nim +++ b/tests/test_uint_comparison.nim @@ -47,7 +47,7 @@ template chkNotIsEven(chk: untyped, a: string, bits: int) = template testComparison(chk, tst: untyped) = tst "operator `LT`": - chkLT(chk, "0", "F", 8) + #[chkLT(chk, "0", "F", 8) chkLT(chk, "F", "FF", 8) chkLT(chk, "0", "F", 16) @@ -63,7 +63,7 @@ template testComparison(chk, tst: untyped) = chkLT(chk, "F", "FF", 64) chkLT(chk, "FF", "FFF", 64) chkLT(chk, "FFFF", 
"FFFFF", 64) - chkLT(chk, "FFFFF", "FFFFFFFF", 64) + chkLT(chk, "FFFFF", "FFFFFFFF", 64)]# chkLT(chk, "0", "F", 128) chkLT(chk, "F", "FF", 128) @@ -73,7 +73,7 @@ template testComparison(chk, tst: untyped) = chkLT(chk, "FFFFFFFFFFF", "FFFFFFFFFFFFFFFFFFFFFFFF", 128) tst "operator not `LT`": - chkNotLT(chk, "0", "F", 8) + #[chkNotLT(chk, "0", "F", 8) chkNotLT(chk, "F", "FF", 8) chkNotLT(chk, "0", "F", 16) @@ -89,7 +89,7 @@ template testComparison(chk, tst: untyped) = chkNotLT(chk, "F", "FF", 64) chkNotLT(chk, "FF", "FFF", 64) chkNotLT(chk, "FFFF", "FFFFF", 64) - chkNotLT(chk, "FFFFF", "FFFFFFFF", 64) + chkNotLT(chk, "FFFFF", "FFFFFFFF", 64)]# chkNotLT(chk, "0", "F", 128) chkNotLT(chk, "F", "FF", 128) @@ -99,7 +99,7 @@ template testComparison(chk, tst: untyped) = chkNotLT(chk, "FFFFFFFFFFF", "FFFFFFFFFFFFFFFFFFFFFFFF", 128) tst "operator `LTE`": - chkLTE(chk, "0", "F", 8) + #[chkLTE(chk, "0", "F", 8) chkLTE(chk, "F", "FF", 8) chkLTE(chk, "F", "F", 8) @@ -119,7 +119,7 @@ template testComparison(chk, tst: untyped) = chkLTE(chk, "FF", "FFF", 64) chkLTE(chk, "FFFF", "FFFFF", 64) chkLTE(chk, "FFFFF", "FFFFFFFF", 64) - chkLTE(chk, "FFFFFFFF", "FFFFFFFF", 64) + chkLTE(chk, "FFFFFFFF", "FFFFFFFF", 64)]# chkLTE(chk, "0", "F", 128) chkLTE(chk, "F", "FF", 128) @@ -130,7 +130,7 @@ template testComparison(chk, tst: untyped) = chkLTE(chk, "FFFFFFFFFFFFFFFFFFFFFFFF", "FFFFFFFFFFFFFFFFFFFFFFFF", 128) tst "operator not `LTE`": - chkNotLTE(chk, "0", "F", 8) + #[chkNotLTE(chk, "0", "F", 8) chkNotLTE(chk, "F", "FF", 8) chkNotLTE(chk, "0", "F", 16) @@ -146,7 +146,7 @@ template testComparison(chk, tst: untyped) = chkNotLTE(chk, "F", "FF", 64) chkNotLTE(chk, "FF", "FFF", 64) chkNotLTE(chk, "FFFF", "FFFFF", 64) - chkNotLTE(chk, "FFFFF", "FFFFFFFF", 64) + chkNotLTE(chk, "FFFFF", "FFFFFFFF", 64)]# chkNotLTE(chk, "0", "F", 128) chkNotLTE(chk, "F", "FF", 128) @@ -156,7 +156,7 @@ template testComparison(chk, tst: untyped) = chkNotLTE(chk, "FFFFFFFFFFF", "FFFFFFFFFFFFFFFFFFFFFFFF", 128) tst "operator `EQ`": - chkEQ(chk, "0", "0", 8) + #[chkEQ(chk, "0", "0", 8) chkEQ(chk, "FF", "FF", 8) chkEQ(chk, "F", "F", 8) @@ -176,7 +176,7 @@ template testComparison(chk, tst: untyped) = chkEQ(chk, "FF", "FF", 64) chkEQ(chk, "FFFF", "FFFF", 64) chkEQ(chk, "FFFFF", "FFFFF", 64) - chkEQ(chk, "FFFFFFFF", "FFFFFFFF", 64) + chkEQ(chk, "FFFFFFFF", "FFFFFFFF", 64)]# chkEQ(chk, "0", "0", 128) chkEQ(chk, "F", "F", 128) @@ -186,7 +186,7 @@ template testComparison(chk, tst: untyped) = chkEQ(chk, "FFFFFFFFFFFFFFFFFFFFFFFF", "FFFFFFFFFFFFFFFFFFFFFFFF", 128) tst "operator not `EQ`": - chkNotEQ(chk, "0", "F", 8) + #[chkNotEQ(chk, "0", "F", 8) chkNotEQ(chk, "F", "FF", 8) chkNotEQ(chk, "0", "F", 16) @@ -202,7 +202,7 @@ template testComparison(chk, tst: untyped) = chkNotEQ(chk, "F", "FF", 64) chkNotEQ(chk, "FF", "FFF", 64) chkNotEQ(chk, "FFFF", "FFFFF", 64) - chkNotEQ(chk, "FFFFF", "FFFFFFFF", 64) + chkNotEQ(chk, "FFFFF", "FFFFFFFF", 64)]# chkNotEQ(chk, "0", "F", 128) chkNotEQ(chk, "F", "FF", 128) @@ -212,92 +212,92 @@ template testComparison(chk, tst: untyped) = chkNotEQ(chk, "FFFFFFFFFFF", "FFFFFFFFFFFFFFFFFFFFFFFF", 128) tst "operator `isZero`": - chkIsZero(chk, "0", 8) + #[chkIsZero(chk, "0", 8) chkIsZero(chk, "0", 16) chkIsZero(chk, "0", 32) - chkIsZero(chk, "0", 64) + chkIsZero(chk, "0", 64)]# chkIsZero(chk, "0", 128) chkIsZero(chk, "0", 256) tst "operator not `isZero`": - chkNotIsZero(chk, "1", 8) + #[chkNotIsZero(chk, "1", 8) chkNotIsZero(chk, "2", 16) chkNotIsZero(chk, "3", 32) - chkNotIsZero(chk, "4", 64) + chkNotIsZero(chk, "4", 64)]# 
chkNotIsZero(chk, "5", 128) chkNotIsZero(chk, "6", 256) tst "operator `isOdd`": - chkIsOdd(chk, "1", 8) + #[chkIsOdd(chk, "1", 8) chkIsOdd(chk, "1", 16) chkIsOdd(chk, "1", 32) - chkIsOdd(chk, "1", 64) + chkIsOdd(chk, "1", 64)]# chkIsOdd(chk, "1", 128) chkIsOdd(chk, "1", 256) - chkIsOdd(chk, "FF", 8) + #[chkIsOdd(chk, "FF", 8) chkIsOdd(chk, "FFF", 16) chkIsOdd(chk, "FFFFF", 32) - chkIsOdd(chk, "FFFFFF", 64) + chkIsOdd(chk, "FFFFFF", 64)]# chkIsOdd(chk, "FFFFFFFFFFFFFFF", 128) chkIsOdd(chk, "FFFFFFFFFFFFFFFFFF", 256) tst "operator not `isOdd`": - chkNotIsOdd(chk, "0", 8) + #[chkNotIsOdd(chk, "0", 8) chkNotIsOdd(chk, "0", 16) chkNotIsOdd(chk, "0", 32) - chkNotIsOdd(chk, "0", 64) + chkNotIsOdd(chk, "0", 64)]# chkNotIsOdd(chk, "0", 128) chkNotIsOdd(chk, "0", 256) - chkNotIsOdd(chk, "4", 8) + #[chkNotIsOdd(chk, "4", 8) chkNotIsOdd(chk, "4", 16) chkNotIsOdd(chk, "4", 32) - chkNotIsOdd(chk, "4", 64) + chkNotIsOdd(chk, "4", 64)]# chkNotIsOdd(chk, "4", 128) chkNotIsOdd(chk, "4", 256) - chkNotIsOdd(chk, "A", 8) + #[chkNotIsOdd(chk, "A", 8) chkNotIsOdd(chk, "AAA", 16) chkNotIsOdd(chk, "AAAA", 32) - chkNotIsOdd(chk, "FFFFFA", 64) + chkNotIsOdd(chk, "FFFFFA", 64)]# chkNotIsOdd(chk, "FFFFFFFFFFFFFFA", 128) chkNotIsOdd(chk, "FFFFFFFFFFFFFFFFFA", 256) tst "operator `isEven`": - chkNotIsOdd(chk, "0", 8) + #[chkNotIsOdd(chk, "0", 8) chkNotIsOdd(chk, "0", 16) chkNotIsOdd(chk, "0", 32) - chkNotIsOdd(chk, "0", 64) + chkNotIsOdd(chk, "0", 64)]# chkNotIsOdd(chk, "0", 128) chkNotIsOdd(chk, "0", 256) - chkNotIsOdd(chk, "4", 8) + #[chkNotIsOdd(chk, "4", 8) chkNotIsOdd(chk, "4", 16) chkNotIsOdd(chk, "4", 32) - chkNotIsOdd(chk, "4", 64) + chkNotIsOdd(chk, "4", 64)]# chkNotIsOdd(chk, "4", 128) chkNotIsOdd(chk, "4", 256) - chkNotIsOdd(chk, "A", 8) + #[chkNotIsOdd(chk, "A", 8) chkNotIsOdd(chk, "AAA", 16) chkNotIsOdd(chk, "AAAA", 32) - chkNotIsOdd(chk, "FFFFFA", 64) + chkNotIsOdd(chk, "FFFFFA", 64)]# chkNotIsOdd(chk, "FFFFFFFFFFFFFFA", 128) chkNotIsOdd(chk, "FFFFFFFFFFFFFFFFFA", 256) tst "operator not `isEven`": - chkIsOdd(chk, "1", 8) + #[chkIsOdd(chk, "1", 8) chkIsOdd(chk, "1", 16) chkIsOdd(chk, "1", 32) - chkIsOdd(chk, "1", 64) + chkIsOdd(chk, "1", 64)]# chkIsOdd(chk, "1", 128) chkIsOdd(chk, "1", 256) - chkIsOdd(chk, "FF", 8) + #[chkIsOdd(chk, "FF", 8) chkIsOdd(chk, "FFF", 16) chkIsOdd(chk, "FFFFF", 32) - chkIsOdd(chk, "FFFFFF", 64) + chkIsOdd(chk, "FFFFFF", 64)]# chkIsOdd(chk, "FFFFFFFFFFFFFFF", 128) chkIsOdd(chk, "FFFFFFFFFFFFFFFFFF", 256) @@ -307,6 +307,7 @@ static: suite "Wider unsigned int comparison coverage": testComparison(check, test) +#[ suite "Testing unsigned int comparison operators": let a = 10'i16.stuint(16) @@ -358,5 +359,5 @@ suite "Testing unsigned int comparison operators": not a.isOdd b.isOdd not b.isEven - c.isEven - not c.isOdd + # c.isEven + # not c.isOdd]# diff --git a/tests/test_uint_muldiv.nim b/tests/test_uint_divmod.nim similarity index 72% rename from tests/test_uint_muldiv.nim rename to tests/test_uint_divmod.nim index c45405d..4dae110 100644 --- a/tests/test_uint_muldiv.nim +++ b/tests/test_uint_divmod.nim @@ -9,9 +9,6 @@ import ../stint, unittest, test_helpers -template chkMul(chk: untyped, a, b, c: string, bits: int) = - chk (fromHex(StUint[bits], a) * fromHex(StUint[bits], b)) == fromHex(StUint[bits], c) - template chkDiv(chk: untyped, a, b, c: string, bits: int) = chk (fromHex(StUint[bits], a) div fromHex(StUint[bits], b)) == fromHex(StUint[bits], c) @@ -21,43 +18,9 @@ template chkMod(chk: untyped, a, b, c: string, bits: int) = template chkDivMod(chk: untyped, a, b, c, d: string, 
diff --git a/tests/test_uint_muldiv.nim b/tests/test_uint_divmod.nim
similarity index 72%
rename from tests/test_uint_muldiv.nim
rename to tests/test_uint_divmod.nim
index c45405d..4dae110 100644
--- a/tests/test_uint_muldiv.nim
+++ b/tests/test_uint_divmod.nim
@@ -9,9 +9,6 @@

 import ../stint, unittest, test_helpers

-template chkMul(chk: untyped, a, b, c: string, bits: int) =
-  chk (fromHex(StUint[bits], a) * fromHex(StUint[bits], b)) == fromHex(StUint[bits], c)
-
 template chkDiv(chk: untyped, a, b, c: string, bits: int) =
   chk (fromHex(StUint[bits], a) div fromHex(StUint[bits], b)) == fromHex(StUint[bits], c)

@@ -21,43 +18,9 @@ template chkMod(chk: untyped, a, b, c: string, bits: int) =
 template chkDivMod(chk: untyped, a, b, c, d: string, bits: int) =
   chk divmod(fromHex(StUint[bits], a), fromHex(StUint[bits], b)) == (fromHex(StUint[bits], c), fromHex(StUint[bits], d))

-template testMuldiv(chk, tst: untyped) =
-  tst "operator `mul`":
-    chkMul(chk, "0", "3", "0", 8)
-    chkMul(chk, "1", "3", "3", 8)
-    chkMul(chk, "64", "3", "2C", 8) # overflow
-
-    chkMul(chk, "0", "3", "0", 16)
-    chkMul(chk, "1", "3", "3", 16)
-    chkMul(chk, "64", "3", "12C", 16)
-    chkMul(chk, "1770", "46", "68A0", 16) # overflow
-
-    chkMul(chk, "0", "3", "0", 32)
-    chkMul(chk, "1", "3", "3", 32)
-    chkMul(chk, "64", "3", "12C", 32)
-    chkMul(chk, "1770", "46", "668A0", 32)
-    chkMul(chk, "13880", "13880", "7D784000", 32) # overflow
-
-    chkMul(chk, "0", "3", "0", 64)
-    chkMul(chk, "1", "3", "3", 64)
-    chkMul(chk, "64", "3", "12C", 64)
-    chkMul(chk, "1770", "46", "668A0", 64)
-    chkMul(chk, "13880", "13880", "17D784000", 64)
-    chkMul(chk, "3B9ACA00", "E8D4A51000", "35C9ADC5DEA00000", 64) # overflow
-
-    chkMul(chk, "0", "3", "0", 128)
-    chkMul(chk, "1", "3", "3", 128)
-    chkMul(chk, "64", "3", "12C", 128)
-    chkMul(chk, "1770", "46", "668A0", 128)
-    chkMul(chk, "13880", "13880", "17D784000", 128)
-    chkMul(chk, "3B9ACA00", "E8D4A51000", "3635C9ADC5DEA00000", 128)
-    chkMul(chk, "25295F0D1", "10", "25295F0D10", 128)
-    chkMul(chk, "123456789ABCDEF00", "123456789ABCDEF00", "4b66dc33f6acdca5e20890f2a5210000", 128) # overflow
-
-    chkMul(chk, "123456789ABCDEF00", "123456789ABCDEF00", "14b66dc33f6acdca5e20890f2a5210000", 256)
-
+template testdivmod(chk, tst: untyped) =
   tst "operator `div`":
-    chkDiv(chk, "0", "3", "0", 8)
+    #[chkDiv(chk, "0", "3", "0", 8)
     chkDiv(chk, "1", "3", "0", 8)
     chkDiv(chk, "3", "3", "1", 8)
     chkDiv(chk, "3", "1", "3", 8)
@@ -85,7 +48,7 @@ template testMuldiv(chk, tst: untyped) =
     chkDiv(chk, "FF", "3", "55", 64)
     chkDiv(chk, "FFFF", "3", "5555", 64)
     chkDiv(chk, "FFFFFFFF", "3", "55555555", 64)
-    chkDiv(chk, "FFFFFFFFFFFFFFFF", "3", "5555555555555555", 64)
+    chkDiv(chk, "FFFFFFFFFFFFFFFF", "3", "5555555555555555", 64)]#

     chkDiv(chk, "0", "3", "0", 128)
     chkDiv(chk, "1", "3", "0", 128)
@@ -98,7 +61,7 @@ template testMuldiv(chk, tst: untyped) =
     chkDiv(chk, "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", "3", "55555555555555555555555555555555", 128)

   tst "operator `mod`":
-    chkMod(chk, "0", "3", "0", 8)
+    #[chkMod(chk, "0", "3", "0", 8)
     chkMod(chk, "1", "3", "1", 8)
     chkMod(chk, "3", "3", "0", 8)
     chkMod(chk, "3", "1", "0", 8)
@@ -138,7 +101,7 @@ template testMuldiv(chk, tst: untyped) =
     chkMod(chk, "FFFFFFFF", "3", "0", 64)
     chkMod(chk, "FFFFFFFF", "23", "A", 64)
     chkMod(chk, "FFFFFFFF", "27", "15", 64)
-    chkMod(chk, "FFFFFFFFFFFFFFFF", "27", "F", 64)
+    chkMod(chk, "FFFFFFFFFFFFFFFF", "27", "F", 64)]#

     chkMod(chk, "0", "3", "0", 128)
     chkMod(chk, "1", "3", "1", 128)
@@ -155,7 +118,7 @@ template testMuldiv(chk, tst: untyped) =
     chkMod(chk, "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", "27", "15", 128)

   tst "operator `divmod`":
-    chkDivMod(chk, "0", "3", "0", "0", 8)
+    #[chkDivMod(chk, "0", "3", "0", "0", 8)
     chkDivMod(chk, "1", "3", "0", "1", 8)
     chkDivMod(chk, "3", "3", "1", "0", 8)
     chkDivMod(chk, "3", "1", "3", "0", 8)
@@ -195,7 +158,7 @@ template testMuldiv(chk, tst: untyped) =
     chkDivMod(chk, "FFFFFFFF", "3", "55555555", "0", 64)
     chkDivMod(chk, "FFFFFFFF", "23", "7507507", "0A", 64)
     chkDivMod(chk, "FFFFFFFF", "27", "6906906", "15", 64)
-    chkDivMod(chk, "FFFFFFFFFFFFFFFF", "27", "690690690690690", "F", 64)
+    chkDivMod(chk, "FFFFFFFFFFFFFFFF", "27", "690690690690690", "F", 64)]#

     chkDivMod(chk, "0", "3", "0", "0", 128)
     chkDivMod(chk, "1", "3", "0", "1", 128)
@@ -212,45 +175,12 @@ template testMuldiv(chk, tst: untyped) =
     chkDivMod(chk, "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", "27", "6906906906906906906906906906906", "15", 128)

 static:
-  testMuldiv(ctCheck, ctTest)
+  testdivmod(ctCheck, ctTest)

 suite "Wider unsigned int muldiv coverage":
-  testMuldiv(check, test)
-
-suite "Testing unsigned int multiplication implementation":
-  test "Multiplication with result fitting in low half":
-
-    let a = 10000.stuint(64)
-    let b = 10000.stuint(64)
-
-    check: cast[uint64](a*b) == 100_000_000'u64 # need 27-bits
-
-  test "Multiplication with result overflowing low half":
-
-    let a = 1_000_000.stuint(64)
-    let b = 1_000_000.stuint(64)
-
-    check: cast[uint64](a*b) == 1_000_000_000_000'u64 # need 40 bits
-
-  test "Full overflow is handled like native unsigned types":
-
-    let a = 1_000_000_000.stuint(64)
-    let b = 1_000_000_000.stuint(64)
-    let c = 1_000.stuint(64)
-
-    let x = 1_000_000_000'u64
-    let y = 1_000_000_000'u64
-    let z = 1_000'u64
-    let w = x*y*z
-
-    #check: cast[uint64](a*b*c) == 1_000_000_000_000_000_000_000'u64 # need 70-bits
-    check: cast[uint64](a*b*c) == w
-
-  test "Nim v1.0.2 32 bit type inference rule changed":
-    let x = 9975492817.stuint(256)
-    let y = 16.stuint(256)
-    check x * y == 159607885072.stuint(256)
+  testdivmod(check, test)

+#[
 suite "Testing unsigned int division and modulo implementation":

   test "Divmod(100, 13) returns the correct result":
@@ -261,19 +191,21 @@ suite "Testing unsigned int division and modulo implementation":

     check: cast[uint64](qr.quot) == 7'u64
     check: cast[uint64](qr.rem)  == 9'u64

-  test "Divmod(2^64, 3) returns the correct result":
-    let a = 1.stuint(128) shl 64
-    let b = 3.stuint(128)
-
-    let qr = divmod(a, b)
-
-    let q = cast[UintImpl[uint64]](qr.quot)
-    let r = cast[UintImpl[uint64]](qr.rem)
-
-    check: q.lo == 6148914691236517205'u64
-    check: q.hi == 0'u64
-    check: r.lo == 1'u64
-    check: r.hi == 0'u64
+  # TODO - no more .lo / .hi
+  #
+  # test "Divmod(2^64, 3) returns the correct result":
+  #   let a = 1.stuint(128) shl 64
+  #   let b = 3.stuint(128)
+  #
+  #   let qr = divmod(a, b)
+  #
+  #   let q = cast[UintImpl[uint64]](qr.quot)
+  #   let r = cast[UintImpl[uint64]](qr.rem)
+  #
+  #   check: q.lo == 6148914691236517205'u64
+  #   check: q.hi == 0'u64
+  #   check: r.lo == 1'u64
+  #   check: r.hi == 0'u64

   test "Divmod(1234567891234567890, 10) returns the correct result":
     let a = cast[StUint[64]](1234567891234567890'u64)
@@ -312,3 +244,4 @@ suite "Testing specific failures highlighted by property-based testing":
     let tz = cast[uint64](a mod b)

     check: z == tz
+]#
\ No newline at end of file
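As a quick reference for what testdivmod asserts, the Euclidean relationship between div, mod and divmod can be checked directly; a minimal sketch, assuming the same stint API the tests use (divmod returning a tuple with quot and rem fields, as accessed above):

import ../stint

let a = fromHex(StUint[128], "FFFFFFFF")
let b = fromHex(StUint[128], "27")

let qr = divmod(a, b)               # (quotient, remainder), as compared above
doAssert qr.quot == a div b
doAssert qr.rem == a mod b
doAssert qr.quot * b + qr.rem == a  # Euclidean invariant: a == q*b + r, with r < b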
diff --git a/tests/test_uint_endians2.nim b/tests/test_uint_endians2.nim
index f8e19a6..d9acc1f 100644
--- a/tests/test_uint_endians2.nim
+++ b/tests/test_uint_endians2.nim
@@ -9,6 +9,7 @@

 import ../stint, unittest, stew/byteutils, test_helpers

+
 template chkSwapBytes(chk: untyped, bits: int, hex: string) =
   # dumpHex already do the job to swap the output if
   # we use `littleEndian` on both platform
@@ -35,7 +36,7 @@ template chkFromBytes(chk: untyped, bits: int, hex: string) =

 template chkFromBytesBE(chk: untyped, bits: int, hex: string) =
   let x = fromHex(StUint[bits], hex)
-  let z = fromBytesBE(StUint[bits], toBytesBE(x))
+  let z = fromBytesBE(StUint[bits], toByteArrayBE(x))
   chk z == x

 template chkFromBytesLE(chk: untyped, bits: int, hex: string) =
@@ -50,28 +51,28 @@ template chkFromToLE(chk: untyped, bits: int, hex: string) =

 template chkFromToBE(chk: untyped, bits: int, hex: string) =
   let x = fromHex(StUint[bits], hex)
-  let z = x.fromBE.toBE
+  let z = x.fromBytesBE.toByteArrayBE
   chk z == x

 template chkEndians(chkFunc, tst, name: untyped) =
   tst astToStr(name).substr(3):
-    name(chkFunc, 8, "ab")
-    name(chkFunc, 16, "abcd")
-    name(chkFunc, 32, "abcdef12")
-    name(chkFunc, 64, "abcdef1234567890")
+    #name(chkFunc, 8, "ab")
+    #name(chkFunc, 16, "abcd")
+    #name(chkFunc, 32, "abcdef12")
+    #name(chkFunc, 64, "abcdef1234567890")
     name(chkFunc, 128, "abcdef1234567890abcdef1234567890")
     name(chkFunc, 256, "abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890")

 template testEndians(chkFunc, tst: untyped) =
-  chkEndians(chkFunc, tst, chkSwapBytes)
-  chkEndians(chkFunc, tst, chkToBytes)
-  chkEndians(chkFunc, tst, chkToBytesLE)
+  #chkEndians(chkFunc, tst, chkSwapBytes)
+  #chkEndians(chkFunc, tst, chkToBytes)
+  #chkEndians(chkFunc, tst, chkToBytesLE)
   chkEndians(chkFunc, tst, chkToBytesBE)
-  chkEndians(chkFunc, tst, chkFromBytes)
-  chkEndians(chkFunc, tst, chkFromBytesLE)
-  chkEndians(chkFunc, tst, chkFromBytesBE)
-  chkEndians(chkFunc, tst, chkFromToLE)
-  chkEndians(chkFunc, tst, chkFromToBE)
+  #chkEndians(chkFunc, tst, chkFromBytes)
+  #chkEndians(chkFunc, tst, chkFromBytesLE)
+  #chkEndians(chkFunc, tst, chkFromBytesBE)
+  #chkEndians(chkFunc, tst, chkFromToLE)
+  #chkEndians(chkFunc, tst, chkFromToBE)

 static:
   testEndians(ctCheck, ctTest)

@@ -80,16 +81,16 @@ suite "Testing endians":

   test "Endians give sane results":
     check:
-      1.u128.toBytesBE() ==
+      1.u128.toByteArrayBE() ==
        [0'u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]

-      1.u128.toBytesLE() ==
-        [1'u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+      #1.u128.toBytesLE() ==
+      #  [1'u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

       1.u128 == UInt128.fromBytesBE(
        [0'u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

-      1.u128 == UInt128.fromBytesLE(
-        [1'u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+      #1.u128 == UInt128.fromBytesLE(
+      #  [1'u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

   testEndians(check, test)
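The big-endian round-trip that stays enabled above pairs toByteArrayBE with fromBytesBE; a small sketch of that behaviour, assuming the same API as the hunks (the LE variants are disabled by this change):

import ../stint

let x = fromHex(StUint[128], "abcdef1234567890abcdef1234567890")
let bytes = x.toByteArrayBE          # 16 bytes, most significant byte first
doAssert fromBytesBE(StUint[128], bytes) == x

# In big-endian order the least significant byte of 1 lands in the last slot.
doAssert 1.u128.toByteArrayBE()[15] == 1'u8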
diff --git a/tests/test_uint_exp.nim b/tests/test_uint_exp.nim
index 93b61cd..44fdd49 100644
--- a/tests/test_uint_exp.nim
+++ b/tests/test_uint_exp.nim
@@ -17,7 +17,7 @@ template chkPow(chk: untyped, a: string, b: SomeInteger, c: string, bits: int) =

 template testExp(chk, tst: untyped) =
   tst "BigInt BigInt Pow":
-    chkPow(chk, "F", "2", "E1", 8)
+    #[chkPow(chk, "F", "2", "E1", 8)

     chkPow(chk, "F", "2", "E1", 16)
     chkPow(chk, "FF", "2", "FE01", 16)
@@ -29,7 +29,7 @@ template testExp(chk, tst: untyped) =
     chkPow(chk, "F", "2", "E1", 64)
     chkPow(chk, "FF", "2", "FE01", 64)
     chkPow(chk, "FF", "3", "FD02FF", 64)
-    chkPow(chk, "FFF", "3", "FFD002FFF", 64)
+    chkPow(chk, "FFF", "3", "FFD002FFF", 64)]#

     chkPow(chk, "F", "2", "E1", 128)
     chkPow(chk, "FF", "2", "FE01", 128)
@@ -38,7 +38,7 @@ template testExp(chk, tst: untyped) =
     chkPow(chk, "FFFFF", "3", "ffffd00002fffff", 128)

   tst "BigInt Natural Pow":
-    chkPow(chk, "F", 2, "E1", 8)
+    #[chkPow(chk, "F", 2, "E1", 8)

     chkPow(chk, "F", 2, "E1", 16)
     chkPow(chk, "FF", 2, "FE01", 16)
@@ -50,7 +50,7 @@ template testExp(chk, tst: untyped) =
     chkPow(chk, "F", 2, "E1", 64)
     chkPow(chk, "FF", 2, "FE01", 64)
     chkPow(chk, "FF", 3, "FD02FF", 64)
-    chkPow(chk, "FFF", 3, "FFD002FFF", 64)
+    chkPow(chk, "FFF", 3, "FFD002FFF", 64)]#

     chkPow(chk, "F", 2, "E1", 128)
     chkPow(chk, "FF", 2, "FE01", 128)
@@ -64,6 +64,7 @@ static:

 suite "Wider unsigned int exp coverage":
   testExp(check, test)
+#[

 suite "Testing unsigned exponentiation":
   test "Simple exponentiation 5^3":
@@ -84,3 +85,4 @@ suite "Testing unsigned exponentiation":

     check: a.pow(b) == "4922235242952026704037113243122008064".u256
     check: a.pow(b.stuint(256)) == "4922235242952026704037113243122008064".u256
+]#
\ No newline at end of file
diff --git a/tests/test_uint_modular_arithmetic.nim b/tests/test_uint_modular_arithmetic.nim
index 6b3ad19..db5842f 100644
--- a/tests/test_uint_modular_arithmetic.nim
+++ b/tests/test_uint_modular_arithmetic.nim
@@ -23,7 +23,7 @@ template chkPowMod(chk: untyped, a, b, m, c: string, bits: int) =

 template testModArith(chk, tst: untyped) =
   tst "addmod":
-    chkAddMod(chk, "F", "F", "7", "2", 8)
+    #[chkAddMod(chk, "F", "F", "7", "2", 8)

     chkAddMod(chk, "AAAA", "AA", "F", "0", 16)
     chkAddMod(chk, "BBBB", "AAAA", "9", "3", 16)
@@ -36,7 +36,7 @@ template testModArith(chk, tst: untyped) =
     chkAddMod(chk, "AAAA", "AA", "F", "0", 64)
     chkAddMod(chk, "BBBB", "AAAA", "9", "3", 64)
     chkAddMod(chk, "BBBBBBBB", "AAAAAAAA", "9", "6", 64)
-    chkAddMod(chk, "BBBBBBBBBBBBBBBB", "AAAAAAAAAAAAAAAA", "9", "3", 64)
+    chkAddMod(chk, "BBBBBBBBBBBBBBBB", "AAAAAAAAAAAAAAAA", "9", "3", 64)]#

     chkAddMod(chk, "F", "F", "7", "2", 128)
     chkAddMod(chk, "AAAA", "AA", "F", "0", 128)
@@ -47,7 +47,7 @@ template testModArith(chk, tst: untyped) =

   tst "submod":
-    chkSubMod(chk, "C", "3", "C", "9", 8)
+    #[chkSubMod(chk, "C", "3", "C", "9", 8)
     chkSubMod(chk, "1", "3", "C", "A", 8)

     chkSubMod(chk, "1", "FF", "C", "A", 8)
@@ -64,7 +64,7 @@ template testModArith(chk, tst: untyped) =
     chkSubMod(chk, "1", "3", "C", "A", 64)
     chkSubMod(chk, "1", "FFFF", "C", "A", 64)
     chkSubMod(chk, "1", "FFFFFFFF", "C", "A", 64)
-    chkSubMod(chk, "1", "FFFFFFFFFFFFFFFF", "C", "A", 64)
+    chkSubMod(chk, "1", "FFFFFFFFFFFFFFFF", "C", "A", 64)]#

     chkSubMod(chk, "C", "3", "C", "9", 128)
     chkSubMod(chk, "1", "3", "C", "A", 128)
@@ -74,7 +74,7 @@ template testModArith(chk, tst: untyped) =
     chkSubMod(chk, "1", "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", "C", "A", 128)

   tst "mulmod":
-    chkMulMod(chk, "C", "3", "C", "0", 8)
+    #[chkMulMod(chk, "C", "3", "C", "0", 8)
     chkMulMod(chk, "1", "3", "C", "3", 8)

     chkMulMod(chk, "1", "FF", "C", "3", 8)
@@ -91,7 +91,7 @@ template testModArith(chk, tst: untyped) =
     chkMulMod(chk, "1", "3", "C", "3", 64)
     chkMulMod(chk, "1", "FFFF", "C", "3", 64)
     chkMulMod(chk, "1", "FFFFFFFF", "C", "3", 64)
-    chkMulMod(chk, "1", "FFFFFFFFFFFFFFFF", "C", "3", 64)
+    chkMulMod(chk, "1", "FFFFFFFFFFFFFFFF", "C", "3", 64)]#

     chkMulMod(chk, "C", "3", "C", "0", 128)
     chkMulMod(chk, "1", "3", "C", "3", 128)
@@ -106,7 +106,7 @@ template testModArith(chk, tst: untyped) =
     discard
   else:
     tst "powmod":
-      chkPowMod(chk, "C", "3", "C", "0", 8)
+      #[chkPowMod(chk, "C", "3", "C", "0", 8)
       chkPowMod(chk, "1", "3", "C", "1", 8)
       chkPowMod(chk, "1", "FF", "C", "1", 8)
       chkPowMod(chk, "FF", "3", "C", "3", 8)
@@ -130,7 +130,7 @@ template testModArith(chk, tst: untyped) =
       chkPowMod(chk, "FF", "3", "C", "3", 64)
       chkPowMod(chk, "FFFF", "3", "C", "3", 64)
       chkPowMod(chk, "FFFFFFFF", "3", "C", "3", 64)
-      chkPowMod(chk, "FFFFFFFFFFFFFFFF", "3", "C", "3", 64)
+      chkPowMod(chk, "FFFFFFFFFFFFFFFF", "3", "C", "3", 64)]#

       chkPowMod(chk, "C", "3", "C", "0", 128)
       chkPowMod(chk, "1", "3", "C", "1", 128)
@@ -147,6 +147,7 @@ static:

 suite "Wider unsigned Modular arithmetic coverage":
   testModArith(check, test)
+#[

 suite "Modular arithmetic":
   test "Modular addition":
@@ -202,3 +203,4 @@ suite "Modular arithmetic":

     check: powmod(P, Q, M) == expected
+]#
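For reference, the modular primitives exercised above reduce their result modulo m; a minimal sketch mirroring three of the 128-bit cases kept enabled in the hunks, assuming the addmod/mulmod/powmod signatures the tests use:

import ../stint

let m = fromHex(StUint[128], "C")  # modulus 12

# (15 + 15) mod 7 == 2
doAssert addmod(fromHex(StUint[128], "F"), fromHex(StUint[128], "F"),
                fromHex(StUint[128], "7")) == fromHex(StUint[128], "2")
# (1 * 3) mod 12 == 3
doAssert mulmod(fromHex(StUint[128], "1"), fromHex(StUint[128], "3"), m) ==
  fromHex(StUint[128], "3")
# 255^3 mod 12 == 3, since 255 mod 12 == 3 and 3^3 == 27 == 2*12 + 3
doAssert powmod(fromHex(StUint[128], "FF"), fromHex(StUint[128], "3"), m) ==
  fromHex(StUint[128], "3")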
diff --git a/tests/test_uint_mul.nim b/tests/test_uint_mul.nim
new file mode 100644
index 0000000..1d08167
--- /dev/null
+++ b/tests/test_uint_mul.nim
@@ -0,0 +1,90 @@
+# Stint
+# Copyright 2018 Status Research & Development GmbH
+# Licensed under either of
+#
+#  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
+#  * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
+#
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import ../stint, unittest, test_helpers
+
+template chkMul(chk: untyped, a, b, c: string, bits: int) =
+  chk (fromHex(StUint[bits], a) * fromHex(StUint[bits], b)) == fromHex(StUint[bits], c)
+
+template testMul(chk, tst: untyped) =
+  tst "operator `mul`":
+    #[chkMul(chk, "0", "3", "0", 8)
+    chkMul(chk, "1", "3", "3", 8)
+    chkMul(chk, "64", "3", "2C", 8) # overflow
+
+    chkMul(chk, "0", "3", "0", 16)
+    chkMul(chk, "1", "3", "3", 16)
+    chkMul(chk, "64", "3", "12C", 16)
+    chkMul(chk, "1770", "46", "68A0", 16) # overflow
+
+    chkMul(chk, "0", "3", "0", 32)
+    chkMul(chk, "1", "3", "3", 32)
+    chkMul(chk, "64", "3", "12C", 32)
+    chkMul(chk, "1770", "46", "668A0", 32)
+    chkMul(chk, "13880", "13880", "7D784000", 32) # overflow
+
+    chkMul(chk, "0", "3", "0", 64)
+    chkMul(chk, "1", "3", "3", 64)
+    chkMul(chk, "64", "3", "12C", 64)
+    chkMul(chk, "1770", "46", "668A0", 64)
+    chkMul(chk, "13880", "13880", "17D784000", 64)
+    chkMul(chk, "3B9ACA00", "E8D4A51000", "35C9ADC5DEA00000", 64) # overflow]#
+
+    chkMul(chk, "0", "3", "0", 128)
+    chkMul(chk, "1", "3", "3", 128)
+    chkMul(chk, "64", "3", "12C", 128)
+    chkMul(chk, "1770", "46", "668A0", 128)
+    chkMul(chk, "13880", "13880", "17D784000", 128)
+    chkMul(chk, "3B9ACA00", "E8D4A51000", "3635C9ADC5DEA00000", 128)
+    chkMul(chk, "25295F0D1", "10", "25295F0D10", 128)
+    chkMul(chk, "123456789ABCDEF00", "123456789ABCDEF00", "4b66dc33f6acdca5e20890f2a5210000", 128) # overflow
+
+    chkMul(chk, "123456789ABCDEF00", "123456789ABCDEF00", "14b66dc33f6acdca5e20890f2a5210000", 256)
+
+static:
+  testMul(ctCheck, ctTest)
+
+suite "Wider unsigned int muldiv coverage":
+  testMul(check, test)
+
+#[
+suite "Testing unsigned int multiplication implementation":
+  test "Multiplication with result fitting in low half":
+
+    let a = 10000.stuint(64)
+    let b = 10000.stuint(64)
+
+    check: cast[uint64](a*b) == 100_000_000'u64 # need 27-bits
+
+  test "Multiplication with result overflowing low half":
+
+    let a = 1_000_000.stuint(64)
+    let b = 1_000_000.stuint(64)
+
+    check: cast[uint64](a*b) == 1_000_000_000_000'u64 # need 40 bits
+
+  test "Full overflow is handled like native unsigned types":
+
+    let a = 1_000_000_000.stuint(64)
+    let b = 1_000_000_000.stuint(64)
+    let c = 1_000.stuint(64)
+
+    let x = 1_000_000_000'u64
+    let y = 1_000_000_000'u64
+    let z = 1_000'u64
+    let w = x*y*z
+
+    #check: cast[uint64](a*b*c) == 1_000_000_000_000_000_000_000'u64 # need 70-bits
+    check: cast[uint64](a*b*c) == w
+
+  test "Nim v1.0.2 32 bit type inference rule changed":
+    let x = 9975492817.stuint(256)
+    let y = 16.stuint(256)
+    check x * y == 159607885072.stuint(256)
+]#
\ No newline at end of file
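The "# overflow" cases in the new file rely on multiplication wrapping modulo 2^bits, exactly like native unsigned types; a minimal sketch of that behaviour, assuming the fromHex/StUint API used throughout these tests:

import ../stint

let a128 = fromHex(StUint[128], "123456789ABCDEF00")
# The full square needs 129 bits, so the top bit is truncated at 128 bits:
doAssert a128 * a128 == fromHex(StUint[128], "4b66dc33f6acdca5e20890f2a5210000")

let a256 = fromHex(StUint[256], "123456789ABCDEF00")
# At 256 bits the same product fits without truncation:
doAssert a256 * a256 == fromHex(StUint[256], "14b66dc33f6acdca5e20890f2a5210000")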