rust-num
diff --git a/‎src/biguint/convert.rs‎
Lines changed: 10 additions & 4 deletions b/‎src/biguint/convert.rs‎
Lines changed: 10 additions & 4 deletions
diff --git a/‎src/biguint/division.rs‎
Lines changed: 34 additions & 3 deletions b/‎src/biguint/division.rs‎
Lines changed: 34 additions & 3 deletions
@@ -4,7 +4,7 @@
 use super::{biguint_from_vec, BigUint, ToBigUint};
 
 use super::addition::add2;
-use super::division::div_rem_digit;
+use super::division::{div_rem_digit, FAST_DIV_WIDE};
 use super::multiplication::mac_with_carry;
 
 use crate::big_digit::{self, BigDigit};
@@ -688,16 +688,22 @@ pub(super) fn to_radix_digits_le(u: &BigUint, radix: u32) -> Vec<u8> {
 
  let mut digits = u.clone();
 
- let (base, power) = get_half_radix_base(radix);
+ // X86 DIV can quickly divide by a full digit, otherwise we choose a divisor
+ // that's suitable for `div_half` to avoid slow `DoubleBigDigit` division.
+ let (base, power) = if FAST_DIV_WIDE {
+ get_radix_base(radix)
+ } else {
+ get_half_radix_base(radix)
+ };
  let radix = radix as BigDigit;
 
  // For very large numbers, the O(n²) loop of repeated `div_rem_digit` dominates the
  // performance. We can mitigate this by dividing into chunks of a larger base first.
  // The threshold for this was chosen by anecdotal performance measurements to
  // approximate where this starts to make a noticeable difference.
  if digits.data.len() >= 64 {
- let mut big_base = BigUint::from(base * base);
- let mut big_power = 2usize;
+ let mut big_base = BigUint::from(base);
+ let mut big_power = 1usize;
 
  // Choose a target base length near √n.
  let target_len = digits.data.len().sqrt();
 
@@ -10,12 +10,15 @@ use core::ops::{Div, DivAssign, Rem, RemAssign};
 use num_integer::Integer;
 use num_traits::{CheckedDiv, CheckedEuclid, Euclid, One, ToPrimitive, Zero};
 
+pub(super) const FAST_DIV_WIDE: bool = cfg!(any(target_arch = "x86", target_arch = "x86_64"));
+
 /// Divide a two digit numerator by a one digit divisor, returns quotient and remainder:
 ///
 /// Note: the caller must ensure that both the quotient and remainder will fit into a single digit.
 /// This is _not_ true for an arbitrary numerator/denominator.
 ///
 /// (This function also matches what the x86 divide instruction does).
+#[cfg(any(miri, not(any(target_arch = "x86", target_arch = "x86_64"))))]
 #[inline]
 fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigit) {
  debug_assert!(hi < divisor);
@@ -25,6 +28,34 @@ fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigi
  ((lhs / rhs) as BigDigit, (lhs % rhs) as BigDigit)
 }
 
+/// x86 and x86_64 can use a real `div` instruction.
+#[cfg(all(not(miri), any(target_arch = "x86", target_arch = "x86_64")))]
+#[inline]
+fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigit) {
+ // This debug assertion covers the potential #DE for divisor==0 or a quotient too large for one
+ // register, otherwise in release mode it will become a target-specific fault like SIGFPE.
+ // This should never occur with the inputs from our few `div_wide` callers.
+ debug_assert!(hi < divisor);
+
+ // SAFETY: The `div` instruction only affects registers, reading the explicit operand as the
+ // divisor, and implicitly reading RDX:RAX or EDX:EAX as the dividend. The result is implicitly
+ // written back to RAX or EAX for the quotient and RDX or EDX for the remainder. No memory is
+ // used, and flags are not preserved.
+ unsafe {
+ let (div, rem);
+
+ core::arch::asm!(
+ "div {}",
+ in(reg) divisor,
+ inout("dx") hi => rem,
+ inout("ax") lo => div,
+ options(pure, nomem, nostack),
+ );
+
+ (div, rem)
+ }
+}
+
 /// For small divisors, we can divide without promoting to `DoubleBigDigit` by
 /// using half-size pieces of digit, like long-division.
 #[inline]
@@ -45,7 +76,7 @@ pub(super) fn div_rem_digit(mut a: BigUint, b: BigDigit) -> (BigUint, BigDigit)
 
  let mut rem = 0;
 
- if b <= big_digit::HALF {
+ if !FAST_DIV_WIDE && b <= big_digit::HALF {
  for d in a.data.iter_mut().rev() {
  let (q, r) = div_half(rem, *d, b);
  *d = q;
@@ -70,7 +101,7 @@ fn rem_digit(a: &BigUint, b: BigDigit) -> BigDigit {
 
  let mut rem = 0;
 
- if b <= big_digit::HALF {
+ if !FAST_DIV_WIDE && b <= big_digit::HALF {
  for &digit in a.data.iter().rev() {
  let (_, r) = div_half(rem, digit, b);
  rem = r;
@@ -230,7 +261,7 @@ fn div_rem_core(mut a: BigUint, b: &[BigDigit]) -> (BigUint, BigUint) {
  let mut a0 = 0;
 
  // [b1, b0] are the two most significant digits of the divisor. They never change.
- let b0 = *b.last().unwrap();
+ let b0 = b[b.len() - 1];
  let b1 = b[b.len() - 2];
 
  let q_len = a.data.len() - b.len() + 1;