Diskuze: MOD velkého unsigned integer čísla

V předchozím kvízu, Online test znalostí C++, jsme si ověřili nabyté zkušenosti z kurzu.

Aktivity

Caster

Člen

Caster:1.1.2025 12:43

Pokouším se vypočítat MOD tj. zbytek po dělení velkého unsigned čísla ale stále mi výsledek, který jsem si spočítal na online kalkulačce velkých čísel nesedí (správný výsledek je uveden v programu). Předpokládám, že by mohl být problém v operacích s uint128_t proměnnými i když jsem pomocí krokování při ladění programu ověřil, že by to mělo být v pořádku.

Funkce MOD byla generována pomocí openai o1 preview free viz odkaz ke stažení htm souboru. Po jeho otevření je nutno kliknout na šedý posuvník u pravého okraje a posunout ho úplně dolů.

Zkusil jsem: Vlastní zkušební program:

// AVX2_MOD_Test.cpp : Tento soubor obsahuje funkci main. Provádění programu se tam zahajuje a ukončuje.
//

#include <iostream>
#include <immintrin.h>  // For AVX2
#include <cstring>      // For memcpy and memset
#include <cstdint>      // For uint64_t
#include <windows.h>    // For VirtualAlloc and VirtualFree
#include <algorithm>    // std::max
#include <cstddef>      // for size_t

// https://stackoverflow.com/questions/6759592/how-to-enable-int128-on-visual-studio
#define _CRT_SECURE_NO_WARNINGS
#include <__msvc_int128.hpp>
#include "AVX2_MOD_Test.h"

using namespace std;
using uint128_t = _Unsigned128;

// Constants and definitions
constexpr size_t GB = 1;
constexpr size_t giga = 1024 * 1024 * 1024;
//size_t num_bits = 136279841;
size_t num_bits = 521;    // 13. 2^521-1, 157 cifer
//size_t num_bits = 3;
size_t num_uint64 = ((num_bits + 63) / 64); // Number of uint64_t elements

void big_integer_shift_left_in_place(uint64_t* x, size_t len, size_t shift_bits) {
    if (shift_bits == 0) return;

    size_t shift_words = shift_bits / 64;
    shift_bits = shift_bits % 64;

    if (shift_words > 0) {
        // Shift words
        for (size_t i = len; i-- > shift_words;) {
            x[i] = x[i - shift_words];
        }
        // Zero out lower words
        for (size_t i = 0; i < shift_words; ++i) {
            x[i] = 0;
        }
    }

    if (shift_bits > 0) {
        // Shift bits within words
        uint64_t carry = 0;
        for (size_t i = 0; i < len; ++i) {
            uint64_t temp = x[i];
            x[i] = (temp << shift_bits) | carry;
            carry = temp >> (64 - shift_bits);
        }
    }
}

void big_integer_shift_right_in_place(uint64_t* x, size_t len, size_t shift_bits) {
    if (shift_bits == 0) return;

    size_t shift_words = shift_bits / 64;
    shift_bits = shift_bits % 64;

    if (shift_words > 0) {
        // Shift words
        for (size_t i = 0; i < len - shift_words; ++i) {
            x[i] = x[i + shift_words];
        }
        // Zero out upper words
        for (size_t i = len - shift_words; i < len; ++i) {
            x[i] = 0;
        }
    }

    if (shift_bits > 0) {
        // Shift bits within words
        uint64_t carry = 0;
        for (size_t i = len; i-- > 0;) {
            uint64_t temp = x[i];
            x[i] = (temp >> shift_bits) | carry;
            carry = temp << (64 - shift_bits);
        }
    }
}

// Function to compute the modulus of two large numbers
void mod(uint64_t* dst, uint64_t* src, uint64_t* tmp, size_t dst_len, size_t src_len) {
    if (dst_len < src_len) {
        // The modulus is dst itself
        return;
    }

    // Initialize variables
    size_t n = dst_len;
    size_t k = src_len;

    // Copy dst to remainder, ensure enough space for possible carry
    uint64_t* u = tmp; // Length n + 1
    memset(u, 0, (n + 1) * sizeof(uint64_t));
    memcpy(&u[1], dst, n * sizeof(uint64_t));

    // Copy src to divisor
    uint64_t* v = tmp + n + 1; // Length k
    memcpy(v, src, k * sizeof(uint64_t));

    // Normalize
    unsigned long shift;
    _BitScanReverse64(&shift, v[0]);
    shift = 63 - shift;
    if (shift > 0) {
        big_integer_shift_left_in_place(v, k, shift);
        big_integer_shift_left_in_place(u, n + 1, shift);
    }

    uint64_t* q = tmp + n + 1 + k; // Length n - k + 1
    memset(q, 0, (n - k + 1) * sizeof(uint64_t));

    uint64_t v1 = v[0];
    uint64_t v2 = (k > 1) ? v[1] : 0;

    for (ptrdiff_t j = 0; j <= (ptrdiff_t)(n - k); ++j) {
        // Calculate q_hat and r_hat
        uint64_t uj = u[j];
        uint64_t uj1 = u[j + 1];
        uint64_t uj2 = (j + 2 <= n) ? u[j + 2] : 0;

        uint64_t q_hat;
        uint64_t r_hat;
        q_hat = _udiv128(uj, uj1, v1, &r_hat);

        // Adjust q_hat
        // Perform the comparison using 128-bit arithmetic: q_hat * v2 > r_hat * b + u_j+2
        uint128_t lhs = (uint128_t)q_hat * (uint128_t)v2;
        uint128_t rhs = ((uint128_t)r_hat << 64) + (uint128_t)uj2;

        while (lhs > rhs) {
            q_hat--;
            r_hat += v1;
            if (r_hat < v1) {
                // Overflow occurred, stop adjusting
                break;
            }
            lhs -= v2;
            rhs += ((uint128_t)v1 << 64);
        }

        // Multiply and subtract q_hat * v from u
        uint8_t borrow = 0;
        uint64_t carry = 0;
        for (size_t i = 0; i < k; ++i) {
            uint64_t prod_high, prod_low;
            prod_low = _umul128(q_hat, v[i], &prod_high);

            uint64_t temp;
            borrow = _subborrow_u64(borrow, u[j + i], prod_low + carry, &temp);
            u[j + i] = temp;
            carry = prod_high + (borrow ? 1 : 0);
            borrow = 0; // Reset borrow for the next iteration
        }
        borrow = _subborrow_u64(borrow, u[j + k], carry, &u[j + k]);

        // If we had a borrow, correct
        if (borrow) {
            q_hat--;
            uint8_t carry2 = 0;
            for (size_t i = 0; i < k; ++i) {
                uint64_t temp;
                carry2 = _addcarry_u64(carry2, u[j + i], v[i], &temp);
                u[j + i] = temp;
            }
            _addcarry_u64(carry2, u[j + k], 0, &u[j + k]);
        }

        q[j] = q_hat;
    }

    // Denormalize
    if (shift > 0) {
        big_integer_shift_right_in_place(u, k + 1, shift);
    }

    // The modulus is in u[1..k]
    memcpy(dst, &u[1], k * sizeof(uint64_t));
    memset(dst + k, 0, (dst_len - k) * sizeof(uint64_t));
}


int main() {
    // Allocate 1GB memory
    static const size_t giga = 1024 * 1024 * 1024;
    static const size_t size = GB * giga;
    uint64_t* POLE = static_cast<uint64_t*>(VirtualAlloc(NULL, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE));

    if (!POLE) {
        std::cerr << "Memory allocation failed" << std::endl;
        return -1;
    }

    size_t dst_len = 16;
    size_t src_len = 9;

    // Initialize tmp
    uint64_t* dst = POLE;
    uint64_t* src = dst + dst_len;
    uint64_t* expected_modulus = src + src_len;
    uint64_t* tmp = expected_modulus + src_len;

    // Číslo

        uint64_t dst_init[16] = { 0xe180d8077d300002, 0x7b625008a4fdb542, 0xe9265e2d44f0d9cf, 0x86c153dde116ab13,
                              0x1c432374cb51e3be, 0x37109f09c0bb25e3, 0x4e4e0317cc2e7e96, 0xe7fb642df0da0ec7,
                              0xa073e694d0335847, 0x6f531c9bc39a4535, 0xaa72054f943acfd8, 0x3ee329ad967df484,
                              0x7b6c1bf56a1b4541, 0x66329dc0cbc5bcf8, 0x96780f8534f0ba1d, 0x0000000000001b8c };

        memcpy(dst, dst_init, dst_len * sizeof(uint64_t));

        // 2^521 - 1
        uint64_t src_init[9] = { 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
                                             0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff,
                                             0x00000000000001ff };

        memcpy(src, src_init, src_len * sizeof(uint64_t));


        // Očekávaný výsledek MOD
        uint64_t expected_modulus_init[9] = { 0x6529f3aed3debafd, 0x55ac347acd93cac2, 0xa0d6b8378fae5bd5, 0x6b43594fb2429f10,
                                                          0x8ff4c08cf314a0e6, 0x0e2e2701d6abd260, 0x4768fb7a28b4690b, 0xf16dd49eee538fc1,
                                                          0x0000000000000103 };

        memcpy(expected_modulus, expected_modulus_init, src_len * sizeof(uint64_t));


    // Call mod function
    mod(dst, src, tmp, dst_len, src_len);

    // Verify the result
    if (memcmp(dst, expected_modulus, src_len * sizeof(uint64_t)) == 0) {
        printf("MOD function works correctly.\n");
    }
    else {
        printf("MOD function failed.\n");
    }

    // Free allocated memory
    VirtualFree(POLE, 0, MEM_RELEASE);

    return 0;
}

Chci docílit: Aby program spočítal správný výsledek.

Odpovědět

1.1.2025 12:43

JerryM

Člen

JerryM:8.1.2025 12:28

popravdě máš to nějaký divný
přešti si tohle:
https://stackoverflow.com/…arge-numbers
a knížku
Discrete Mathematics and Its Applications, Kenneth H. Rosen
už má 8. vydání.
a taky tady
https://pdfcoffee.com/…df-free.html
https://stackoverflow.com/…th-libraries

Nahoru Odpovědět

8.1.2025 12:28

JerryM

Člen

JerryM:8.1.2025 12:47

https://stackoverflow.com/…isual-studio

Nahoru Odpovědět

8.1.2025 12:47

Caster

Člen

Caster:8.1.2025 20:59

Jak použít proměnnou typu uint128_t ve Visual Studiu vím. Ke správnému výsledku mi to ale nepomohlo. Nyní radši používám svou externí funkci na 128bitové dělení v x64 asm.

// https://stackoverflow.com/questions/6759592/how-to-enable-int128-on-visual-studio
#define _CRT_SECURE_NO_WARNINGS
#include <__msvc_int128.hpp>
#include "AVX2_MOD_Test.h"

using namespace std;
using uint128_t = _Unsigned128;

Používám externí funkci v x64 asm 128÷128-bit Unsigned Integer Division (128-bit Quotient and Remainder) kterou volám z C++.

Aktuální verze MOD, která ale stále nefunguje.

#include <stdint.h>      // For uint64_t, uint8_t
#include <stddef.h>      // For size_t
#include <string.h>      // For memcpy, memset
#include <stdio.h>       // For printf
#include <intrin.h>      // For _umul128, _addcarry_u64, _subborrow_u64, _BitScanReverse64
#include <windows.h>     // For VirtualAlloc, VirtualFree

// Define uint128_t type using struct
typedef struct {
    uint64_t lo;
    uint64_t hi;
} uint128_t;

uint8_t CarryFlag = 0;  // Crucial: Initialize CarryFlag

// Function to compare two uint128_t values
int compare_uint128(uint128_t a, uint128_t b) {
    if (a.hi < b.hi) return -1;
    if (a.hi > b.hi) return 1;
    if (a.lo < b.lo) return -1;
    if (a.lo > b.lo) return 1;
    return 0;
}

// External udiv128 function (your implementation)
extern "C" void udiv128(uint128_t* q, uint128_t* dividend, uint128_t* divisor, uint128_t* r);

// Shift left by `shift` bits in place
void big_integer_shift_left_in_place(uint64_t* a, size_t len, unsigned long shift_bits) {
    if (shift_bits == 0 || len == 0) return;

    size_t word_shift = shift_bits / 64;
    unsigned long bit_shift = shift_bits % 64;

    if (word_shift >= len) {
        // All bits are shifted out
        memset(a, 0, len * sizeof(uint64_t));
        return;
    }

    // Shift words
    if (word_shift > 0) {
        for (size_t i = len; i-- > word_shift;) {
            a[i] = a[i - word_shift];
        }
        // Zero fill lower words
        memset(a, 0, word_shift * sizeof(uint64_t));
    }

    // Shift bits
    if (bit_shift > 0) {
        uint64_t carry = 0;
        for (size_t i = word_shift; i < len; ++i) {
            uint64_t temp = a[i];
            a[i] = (temp << bit_shift) | carry;
            carry = temp >> (64 - bit_shift);
        }
    }
}

// Shift right by `shift` bits in place
void big_integer_shift_right_in_place(uint64_t* a, size_t len, unsigned long shift) {
    if (shift == 0) return;

    size_t word_shift = shift / 64;
    shift %= 64;

    if (word_shift >= len) {
        memset(a, 0, len * sizeof(uint64_t));
        return;
    }

    if (word_shift > 0) {
        for (size_t i = 0; i + word_shift < len; ++i) {
            a[i] = a[i + word_shift];
        }
        memset(a + len - word_shift, 0, word_shift * sizeof(uint64_t));
    }

    if (shift > 0) {
        uint64_t carry = 0;
        for (size_t i = len; i-- > 0;) {
            uint64_t temp = a[i];
            a[i] = (temp >> shift) | carry;
            carry = temp << (64 - shift);
        }
    }
}


// Corrected mod function with detailed debug printouts
void mod(uint64_t* dst, uint64_t* src, uint64_t* tmp, size_t dst_len, size_t src_len) {
    if (dst_len < src_len) {
        // The modulus is dst itself
        return;
    }

    size_t n = dst_len;
    size_t k = src_len;

    // Initialize u
    size_t u_len = n + k + 1;
    uint64_t* u = tmp; // Length u_len
    memset(u, 0, u_len * sizeof(uint64_t));
    memcpy(u, dst, n * sizeof(uint64_t)); // Copy dst into u[0..n-1]

    // Initialize v
    uint64_t* v = tmp + u_len; // Length k
    memcpy(v, src, k * sizeof(uint64_t));

    // Normalize
    unsigned long shift;
    if (_BitScanReverse64(&shift, v[0])) {
        shift = 63 - shift;
    }
    else {
        printf("Error: Divisor is zero.\n");
        return;
    }

    if (shift > 0) {
        big_integer_shift_left_in_place(v, k, shift);
        big_integer_shift_left_in_place(u, u_len, shift); // u has length u_len
    }

    // Debug: Print normalized v
    printf("Normalized v (k = %zu):\n", k);
    for (size_t i = 0; i < k; ++i) {
        printf("%016llx ", v[i]);
    }
    printf("\n");

    // Debug: Print normalized u
    printf("Normalized u (u_len = %zu):\n", u_len);
    for (size_t i = 0; i < u_len; ++i) {
        printf("%016llx ", u[i]);
    }
    printf("\n");

    // Initialize q
    uint64_t* q = tmp + u_len + k; // Length n - k + 1
    memset(q, 0, (n - k + 1) * sizeof(uint64_t));

    uint64_t v1 = v[0];
    uint64_t v2 = (k > 1) ? v[1] : 0;

    // Main loop
    for (size_t j = 0; j <= n - k; ++j) {
        // Construct the 128-bit dividend
        uint128_t dividend;
        dividend.hi = u[j];
        dividend.lo = u[j + 1];

        // Construct the 128-bit divisor
        uint128_t divisor;
        divisor.hi = 0;
        divisor.lo = v1;

        // Variables for quotient and remainder
        uint128_t q_hat_struct;
        uint128_t r_hat_struct;

        // Use external udiv128 function
        udiv128(&q_hat_struct, &dividend, &divisor, &r_hat_struct);

        uint64_t q_hat = q_hat_struct.lo;
        uint64_t r_hat = r_hat_struct.lo;

        // Next word in u
        uint64_t uj2 = (j + 2 < u_len) ? u[j + 2] : 0;

        // Debug printout
        printf("\nStep %zu:\n", j);
        printf("  dividend.hi=%016llx dividend.lo=%016llx\n", dividend.hi, dividend.lo);
        printf("  divisor.hi=%016llx divisor.lo=%016llx\n", divisor.hi, divisor.lo);
        printf("  Initial q_hat=%016llx r_hat=%016llx\n", q_hat, r_hat);
        printf("  uj2=%016llx\n", uj2);

        /// Adjust q_hat if necessary
        if (k > 1) {
            while (true) {
                uint64_t lhs_hi, lhs_lo;
                lhs_lo = _umul128(q_hat, v2, &lhs_hi);

                // Corrected lhs and rhs
                uint128_t lhs = { lhs_hi, lhs_lo };
                uint128_t rhs = { r_hat, uj2 };

                // Debug: Print comparison values
                printf("    Adjusting q_hat:\n");
                printf("      lhs=%016llx%016llx\n", lhs.hi, lhs.lo);
                printf("      rhs=%016llx%016llx\n", rhs.hi, rhs.lo);

                if (compare_uint128(lhs, rhs) <= 0) {
                    // If lhs <= rhs, stop adjusting
                    break;
                }

                q_hat--;
                CarryFlag = _addcarry_u64(CarryFlag, r_hat, v1, &r_hat);
                if (CarryFlag) {
                    // If overflow occurs, cannot adjust further
                    break;
                }

                // Debug: Print adjusted q_hat and r_hat
                printf("    q_hat decremented to %016llx, r_hat adjusted to %016llx\n", q_hat, r_hat);
            }
        }


        // Debug printout after adjustment
        printf("  Adjusted q_hat=%016llx\n", q_hat);

        // Multiply and subtract q_hat * v from u
        uint8_t borrow = 0;
        uint64_t carry = 0;
        for (size_t i = k; i-- > 0;) {
            uint64_t prod_hi, prod_lo;
            prod_lo = _umul128(q_hat, v[i], &prod_hi);

            // Add previous carry to prod_lo
            uint64_t sum_lo;
            uint8_t carry_in = _addcarry_u64(0, prod_lo, carry, &sum_lo);

            // Update carry
            uint64_t sum_hi = prod_hi + carry_in;

            // Subtract sum_lo from u[j + i]
            uint64_t temp;
            borrow = _subborrow_u64(borrow, u[j + i], sum_lo, &temp);
            u[j + i] = temp;

            carry = sum_hi;
        }
        borrow = _subborrow_u64(borrow, u[j + k], carry, &u[j + k]);

        // Debug printout after subtraction
        printf("  After subtraction borrow=%u\n", borrow);
        printf("  u[%zu..%zu] = ", j, j + k);
        for (size_t idx = j; idx <= j + k; ++idx) {
            printf("%016llx ", u[idx]);
        }
        printf("\n");

        // Correction if borrow occurred
        if (borrow) {
            q_hat--;
            uint8_t carry2 = 0;
            for (size_t i = 0; i <= k; ++i) {
                uint64_t sum;
                uint64_t addend = (i < k) ? v[i] : 0;
                carry2 = _addcarry_u64(carry2, u[j + i], addend, &sum);
                u[j + i] = sum;
            }

            // Debug after correction
            printf("  Correction applied, q_hat decremented to %016llx\n", q_hat);
            printf("  u[%zu..%zu] after correction = ", j, j + k);
            for (size_t idx = j; idx <= j + k; ++idx) {
                printf("%016llx ", u[idx]);
            }
            printf("\n");
        }

        // Debug printout final q_hat
        printf("  Final q_hat=%016llx\n", q_hat);

        q[j] = q_hat;
    }

    // Denormalize
    if (shift > 0) {
        big_integer_shift_right_in_place(u, u_len, shift);
    }

    // Debug: Print final u
    printf("\nFinal u after denormalization (u_len = %zu):\n", u_len);
    for (size_t i = 0; i < u_len; ++i) {
        printf("%016llx ", u[i]);
    }
    printf("\n");

    // The modulus is in u[0..k-1]
    memcpy(dst, u, k * sizeof(uint64_t));
    memset(dst + k, 0, (dst_len - k) * sizeof(uint64_t));
}

int main() {
    const size_t size = 1024 * 1024; // Adjust size as needed

    uint64_t* POLE = (uint64_t*)VirtualAlloc(NULL, size * sizeof(uint64_t), MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);

    if (!POLE) {
        printf("Memory allocation failed.\n");
        return -1;
    }

    size_t dst_len = 16;
    size_t src_len = 9;

    uint64_t* dst = POLE;
    uint64_t* src = dst + dst_len;
    uint64_t* expected_modulus = src + src_len;
    uint64_t* tmp = expected_modulus + src_len;

    uint64_t dst_init[16] = {
        0xe180d8077d300002ULL, 0x7b625008a4fdb542ULL, 0xe9265e2d44f0d9cfULL, 0x86c153dde116ab13ULL,
        0x1c432374cb51e3beULL, 0x37109f09c0bb25e3ULL, 0x4e4e0317cc2e7e96ULL, 0xe7fb642df0da0ec7ULL,
        0xa073e694d0335847ULL, 0x6f531c9bc39a4535ULL, 0xaa72054f943acfd8ULL, 0x3ee329ad967df484ULL,
        0x7b6c1bf56a1b4541ULL, 0x66329dc0cbc5bcf8ULL, 0x96780f8534f0ba1dULL, 0x0000000000001b8cULL
    };

    memcpy(dst, dst_init, dst_len * sizeof(uint64_t));

    uint64_t src_init[9] = {
        0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
        0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
        0x00000000000001ffULL
    };

    memcpy(src, src_init, src_len * sizeof(uint64_t));

    uint64_t expected_modulus_init[9] = {
        0x6529f3aed3debafdULL, 0x55ac347acd93cac2ULL, 0xa0d6b8378fae5bd5ULL, 0x6b43594fb2429f10ULL,
        0x8ff4c08cf314a0e6ULL, 0x0e2e2701d6abd260ULL, 0x4768fb7a28b4690bULL, 0xf16dd49eee538fc1ULL,
        0x0000000000000103ULL
    };

    memcpy(expected_modulus, expected_modulus_init, src_len * sizeof(uint64_t));

    mod(dst, src, tmp, dst_len, src_len);

    if (memcmp(dst, expected_modulus, src_len * sizeof(uint64_t)) == 0) {
        printf("MOD function works correctly.\n");
    }
    else {
        printf("MOD function failed.\n");
        printf("Expected:\n");
        for (size_t i = 0; i < src_len; ++i) {
            printf("%016llx ", expected_modulus[i]);
        }
        printf("\nActual:\n");
        for (size_t i = 0; i < src_len; ++i) {
            printf("%016llx ", dst[i]);
        }
        printf("\n");
    }

    VirtualFree(POLE, 0, MEM_RELEASE);

    return 0;
}

A x64 asm, externí funkce udiv128.asm na 128 bitové dělení viz odkaz nahoře. Kód jsem jen lehce upravil o uložení pár registrů na stoh a jejich vrácení zpět (final_ret: je finální návrat z funkce). __udivmodti4() function for AMD64 processors, supporting the Microsoft calling convention, using the extended precision division algorithm:

; Copyright © 2004-2025, Stefan Kanthak <‍stefan‍.‍kanthak‍@‍nexgo‍.‍de‍>

; Microsoft calling convention for AMD64 platform:
; - first 4 integer or pointer arguments (from left to right) are passed
;   in registers RCX/R1, RDX/R2, R8 and R9;
; - 16-byte arguments are passed by reference;
; - surplus arguments are pushed on stack in reverse order (from right
;   to left), 8-byte aligned;
; - caller allocates memory for 16-byte return value and passes pointer
;   to it as (hidden) first argument, thus shifting all other arguments;
; - caller always allocates "home space" for 4 arguments on stack,
;   even when less than 4 arguments are passed, but does not need to push
;   first 4 arguments;
; - callee can spill first 4 arguments from registers to "home space";
; - callee can clobber "home space";
; - stack is 16-byte aligned: callee must decrement RSP by 8+n*16
;   bytes when it calls other functions (CALL instruction pushes 8 bytes);
; - 64-bit integer or pointer result is returned in register RAX/R0;
; - 32-bit integer result is returned in register EAX;
; - registers RAX/R0, RCX/R1, RDX/R2, R8, R9, R10 and R11 are volatile
;   and can be clobbered;
; - registers RBX/R3, RSP/R4, RBP/R5, RSI/R6, RDI/R7, R12, R13, R14 and
;   R15 must be preserved.

; NOTE: raises "division exception" when divisor is 0!


                                  ; rcx = oword ptr quotient
                                  ; rdx = oword ptr dividend
                                  ; r8 = oword ptr divisor
                                  ; r9 = oword ptr remainder

.code


udiv128 proc

        push rbx        ; Preserve nonvolatile registers
        push r12
        push r13
        push r14

        mov     rax, [rdx]      ; rax = low qword of dividend
        mov     rdx, [rdx+8]; rdx = high qword of dividend

        mov     r10, [r8]       ; r10 = low qword of divisor
        mov     r11, [r8+8]     ; r11 = high qword of divisor

        mov     r8, rcx         ; r8 = address of quotient

        cmp     rax, r10
        mov     rcx, rdx
        sbb     rcx, r11
        jb      trivial         ; dividend < divisor?

        bsr     rcx, r11        ; rcx = index of most significant '1' bit
                                ;        in high qword of divisor
        jnz     extended        ; high qword of divisor <> 0?

        ; divisor < 2**64

        cmp     rdx, r10
        jae     long            ; high qword of dividend >= (low qword of) divisor?

        ; dividend < divisor * 2**64: quotient < 2**64
        ; perform normal division
normal:
        div     r10             ; rax = (low qword of) quotient,
                                ; rdx = (low qword of) remainder
        mov     [r8], rax
        mov     [r8+8], r11     ; high qword of quotient = 0

        test    r9, r9
        jz      @f              ; address of remainder = 0?

        mov     [r9], rdx
        mov     [r9+8], r11     ; high qword of remainder = 0
@@:
        mov     rax, r8         ; rax = address of quotient
        jmp final_ret
        ;ret

        ; dividend >= divisor * 2**64: quotient >= 2**64
        ; perform "long" alias "schoolbook" division
long:
        mov     rcx, rax        ; rcx = low qword of dividend
        mov     rax, rdx        ; rax = high qword of dividend
        mov     rdx, r11        ; rdx:rax = high qword of dividend
        div     r10             ; rax = high qword of quotient,
                                ; rdx = high qword of remainder'
        xchg    rcx, rax        ; rcx = high qword of quotient,
                                ; rax = low qword of dividend
        div     r10             ; rax = low qword of quotient,
                                ; rdx = (low qword of) remainder
        mov     [r8], rax
        mov     [r8+8], rcx     ; quotient = rcx:rax

        test    r9, r9
        jz      @f              ; address of remainder = 0?

        mov     [r9], rdx
        mov     [r9+8], r11     ; high qword of remainder = 0
@@:
        mov     rax, r8         ; rax = address of quotient
        jmp final_ret
        ;ret

        ; dividend < divisor: quotient = 0, remainder = dividend
trivial:
        xor     ecx, ecx
        mov     [r8], rcx
        mov     [r8+8], rcx     ; quotient = 0

        test    r9, r9
        jz      @f              ; address of remainder = 0?

        mov     [r9], rax
        mov     [r9+8], rdx     ; remainder = dividend
@@:
        mov     rax, r8         ; rax = address of quotient
        jmp final_ret
        ;ret

        ; divisor >= 2**64: quotient < 2**64
extended:
        xor     ecx, 63         ; ecx = number of leading '0' bits
                                ;        in (high qword of) divisor
        jz      special         ; divisor >= 2**127?

        ; perform "extended & adjusted" division

        mov     [rsp+8], rbx
        mov     [rsp+16], r12
        mov     [rsp+24], r13
        mov     [rsp+32], r14

        mov     r12, r11        ; r12 = high qword of divisor
        shld    r12, r10, cl    ; r12 = divisor / 2**(index + 1)
                                ;     = divisor'
;;      shl     r10, cl

        mov     r13, rax
        mov     r14, rdx        ; r14:r13 = high qword of dividend
ifndef JccLess
        xor     ebx, ebx        ; rbx = high qword of quotient' = 0

        cmp     rdx, r12
        jb      @f              ; high qword of dividend < divisor'?

        ; high qword of dividend >= divisor':
        ; subtract divisor' from high qword of dividend to prevent possible
        ; quotient overflow and set most significant bit of quotient"

        sub     rdx, r12        ; rdx = high qword of dividend - divisor'
                                ;     = high qword of dividend'
        inc     ebx             ; rbx = high qword of quotient' = 1
@@:
elseif 0
        sub     rdx, r12        ; rdx = high qword of dividend - divisor'
        sbb     rbx, rbx        ; rbx = (high qword of dividend < divisor') ? -1 : 0
        and     rbx, r12        ; rbx = (high qword of dividend < divisor') ? divisor' : 0
        add     rdx, rbx        ; rdx = high qword of dividend
                                ;     - (high qword of dividend < divisor') ? 0 : divisor'
                                ;     = high qword of dividend'
        neg     rbx             ; CF = (high qword of dividend < divisor')
        sbb     ebx, ebx        ; ebx = (high qword of dividend < divisor') ? -1 : 0
        inc     ebx             ; rbx = (high qword of dividend < divisor') ? 0 : 1
                                ;     = high qword of quotient'
elseif 0
        sub     rdx, r12        ; rdx = high qword of dividend - divisor'
        cmovb   rdx, r14        ;     = high qword of dividend'
        sbb     ebx, ebx        ; ebx = (high qword of dividend < divisor') ? -1 : 0
        inc     ebx             ; rbx = (high qword of dividend < divisor') ? 0 : 1
                                ;     = high qword of quotient'
else ; JccLess
        xor     ebx, ebx        ; rbx = high qword of quotient' = 0
        sub     rdx, r12        ; rdx = high qword of dividend - divisor'
        cmovb   rdx, r14        ;     = high qword of dividend'
        sbb     ebx, -1         ; rbx = (high qword of dividend < divisor') ? 0 : 1
                                ;     = high qword of quotient'
endif ; JccLess
        ; high qword of dividend' < divisor'

        div     r12             ; rax = dividend' / divisor'
                                ;     = low qword of quotient',
                                ; rdx = remainder'
        shld    rbx, rax, cl    ; rbx = quotient' / 2**(index + 1)
                                ;     = dividend / divisor'
                                ;     = quotient"
;;      shl     rax, cl
ifndef JccLess
        mov     rax, r10        ; rax = low qword of divisor
        mov     r12, r11        ; r12 = high qword of divisor
        imul    r12, rbx        ; r12 = high qword of divisor * quotient"
        mul     rbx             ; rdx:rax = low qword of divisor * quotient"
        add     rdx, r12        ; rdx:rax = divisor * quotient"
        jnc     @f              ; divisor * quotient" < 2**64?
                                ;  (with carry, it is off by divisor,
                                ;   and quotient" is off by 1)
if 0
        sbb     rbx, 0          ; rbx = quotient" - 1
else
        dec     rbx             ; rbx = quotient" - 1
endif
        sub     rax, r10
        sbb     rdx, r11        ; rdx:rax = divisor * (quotient" - 1)
@@:
        sub     r13, rax
        sbb     r14, rdx        ; r14:r13 = dividend - divisor * quotient"
                                ;         = remainder"
else ; JccLess
        mov     rax, r10        ; rax = low qword of divisor
        mov     r12, r11        ; r12 = high qword of divisor
        imul    r12, rbx        ; r12 = high qword of divisor * quotient"
        mul     rbx             ; rdx:rax = low qword of divisor * quotient"
        sub     r13, rax
        sbb     r14, rdx        ; r14:r13 = dividend
                                ;         - low qword of divisor * quotient"
        sub     r14, r12        ; r14:r13 = dividend - divisor * quotient"
                                ;         = remainder"
endif ; JccLess
        jnb     @f              ; remainder" >= 0?
                                ;  (with borrow, it is off by divisor,
                                ;   and quotient" is off by 1)
if 0
        sbb     rbx, 0          ; rbx = quotient" - 1
                                ;     = quotient
else
        dec     rbx             ; rbx = quotient" - 1
                                ;     = quotient
endif
        add     r13, r10
        adc     r14, r11        ; r14:r13 = remainder" + divisor
                                ;         = remainder
@@:
        xor     eax, eax        ; rax = high qword of quotient = 0
        mov     [r8], rbx
        mov     [r8+8], rax     ; quotient = rax:rbx

        test    r9, r9
        jz      @f              ; address of remainder = 0?

        mov     [r9], r13
        mov     [r9+8], r14     ; remainder = r14:r13
@@:
        mov     rbx, [rsp+8]
        mov     r12, [rsp+16]
        mov     r13, [rsp+24]
        mov     r14, [rsp+32]

        mov     rax, r8         ; rax = address of quotient
        jmp final_ret
        ;ret

        ; dividend >= divisor >= 2**127:
        ; quotient = 1, remainder = dividend - divisor
special:
        mov     [r8+8], rcx
        inc     ecx
        mov     [r8], rcx       ; quotient = 1

        test    r9, r9
        jz      @f              ; address of remainder = 0?

        sub     rax, r10
        sbb     rdx, r11        ; rdx:rax = dividend - divisor
                                ;         = remainder
        mov     [r9], rax
        mov     [r9+8], rdx
@@:
        mov     rax, r8         ; rax = address of quotient
        ;ret

final_ret:
        pop r14        ; Restore nonvolatile registers
        pop r13
        pop r12
        pop rbx
        ret
udiv128 endp
end

\---

Nahoru Odpovědět

8.1.2025 20:59

JerryM

Člen

JerryM:9.1.2025 8:52

no někdy se vyplatí ten kod zahodit a začít úplně znova a uplně od začátku
https://stackoverflow.com/…-bit-integer

Nahoru Odpovědět

9.1.2025 8:52

JerryM

Člen

JerryM:9.1.2025 8:58

a nebo prostě používej Intel C++ compiler, tam to jde
https://stackoverflow.com/…l-c-compiler
https://www.quora.com/…bit-CPUs-GCC
https://www.ehu.eus/…c_ug_lnx.pdf

Nahoru Odpovědět

9.1.2025 8:58

Caster

Člen

Caster:9.1.2025 10:01

V těch 128 bitových operacích (dělení - ext. funkce, násobení, sčítání a odečítání - VS intrinsic) není žádný problém, to jsem si opakovaně zkontroloval. Problém je v ostatních částech kódu na MOD.

Nahoru Odpovědět

9.1.2025 10:01

JerryM

Člen

JerryM:9.1.2025 10:56

já ti rozumim, ale obávám se, že procházení tvého kodu krok za krokem by mi zabralo asi tak století a tolik času nemám budeš muset použít debug a výpis výsledků do textového okna krok za krokem - tzv. atomizace kroků.
jinak Intel C/C++ opravdu MÁ 128bit aritmetiku ...

Nahoru Odpovědět

9.1.2025 10:56

Caster

Člen

Caster:9.1.2025 12:11

openai o1 už na tom pracuje. Musel jsem jí trochu postrčit. Přišla na to, že hned na začátku špatně provádí normalizaci, když jí vyjde posun vlevo 0 tj. uint64_t je 0xFFFFFFFFFFFFFFFF. Už to opravila s tím, že udělá posun vlevo alespoň o jeden bit.

Teď do kódu přidá debug výpisy a určitě to dám s AI do pořádku ;-) .

Nahoru Odpovědět

9.1.2025 12:11

JerryM

Člen

JerryM:9.1.2025 13:35

no já bych si vybral ten Intel C/C++ kompiler ...
AI sem k programování nikdy nepoužil, je to opravdu riziko je to jenom jazykový model ne skutečná AI s IQ 300 z místa

Nahoru Odpovědět

9.1.2025 13:35

Caster

Člen

Caster:9.1.2025 16:37

Už mi to pomohlo v dost věcech a to vč. programování ve Visual Basicu pro Excel, Pythonu a programu v C++ (MPLAB X IDE) pro řízení el. ploténky podle zadané teplotní křivky pro zapájení SMD součástek pro MCU Microchip SAMD21J18 (32bit) vč. výpočtu parametrů PID řízení na základě změřené teplotní charakteristiky el. vařiče ;-) .

Samozřejmě je nutné těm programům rozumět, umět říci co po AI chceš a v čem programuješ a pak si ten kód umět zkontrolovat a ověřit, že počítá správně. V naprosté většině případů je to správně na správně na 1. pokus. Jen v tomto konkrétním případě, kdy potřebuji spočítat MOD velkého čísla (ve finále má 40 mil. desítkových číslic) to jde trochu ztuha.

Někdy AI zapomene, že programuji ve Visual Studiu a ne v Linuxu, tak jí Visual Studio musím stále připomínat. AI zadávám i výstupy mého programu ideálně s ladícími výpisy, které tam na mou žádost sama přidá, a AI mi pak vypíše, co udělala chybně, opraví části programu s vysvětlením a na přání mi vypíše celý opravený program :-) .

Pomocí speciální AI jsem dokonce prohnal snímky magnetické rezonance mého kolene, kdy mi pak AI ještě dříve, než jsem se to dozvěděl od lékaře napsala, že mám natržený vnitřní roh menisku pravého kolene a doporučuje mi artroskopii. Už jsem ji absolvoval a jsem teď v pohodě :-) .

Nahoru Odpovědět

9.1.2025 16:37

JerryM

Člen

JerryM:9.1.2025 17:03

skvělý ... prostě AI boduje

Nahoru Odpovědět

9.1.2025 17:03

Caster

Člen

Caster:11.1.2025 10:25

S AI jsem to konečně úspěšně dořešil. Byly tam dva problémy.

AI mi s několika chybami špatně převedla výpis z paměti děleného čísla na shluky uint64_t, abych mohl zadat velké číslo do programu. Musel jsem číslo zadat ručně.
Přestože AI věděla, že vytváří program pro Visual Studio, napadlo mě, že pracuje nesprávně s čísly uloženými v paměti, což se potvrdilo. Pracovala s formátem big-endian, ale čísla byla uložena ve tvaru little-endia tj. nejméně významný byte nejdříve.

Místo mé externí funkce na dělení 128bitových čísel (zjistil jsem, že v prvních verzích programu někdy došlo k tomu, že byl výsledek dělení větší než 64 bitů, má externí funkce s tím počítala a počítala správně) ji AI nahradila funkcí _udiv128, která dělí 128bitové číslo 64bitovým.

Nyní program funguje tak jak má a správně spočítá výsledek zbytku po dělení velkých čísel. Problem solved ;-)

P.S. Na intenzivní práci s openai o1 preview jsem musel použít placenou Pro verzi. Verze zdarma umožňuje zadat jen jeden dotaz denně a po vyčerpání kreditu za cca 14 dnů je nutné přejít na placenou verzi.

Správný výstup programu:

Shift amount: 55
Before shift left:
00000000000001ff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff
After shift left by 55 bits:
ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ff80000000000000
Before shift left:
0000000000000000 0000000000001b8c 96780fb363dff216 25a9f48824f07aa4 4e390b4886e2f705 ed0de76c166dcc5e f3e65e4c8be40b8c 93c8e4512c2aff07 52374ead5d75f703 f16dd49eee538fb3 811dbf724f027912 031b520792995a23 3dcda4074ed12f6a e84cd25bfc37682a 715cc508696869cf 8f625008a4fdb542 e180d8077d300002
After shift left by 55 bits:
000000000000000d c64b3c07d9b1eff9 0b12d4fa4412783d 52271c85a443717b 82f686f3b60b36e6 2f79f32f2645f205 c649e4722896157f 83a91ba756aebafb 81f8b6ea4f7729c7 d9c08edfb927813c 89018da903c94cad 119ee6d203a76897 b57426692dfe1bb4 1538ae628434b434 e7c7b12804527eda a170c06c03be9800 0100000000000000
Normalized v (k = 9):
ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff ff80000000000000
Normalized u (u_len = 17):
000000000000000d c64b3c07d9b1eff9 0b12d4fa4412783d 52271c85a443717b 82f686f3b60b36e6 2f79f32f2645f205 c649e4722896157f 83a91ba756aebafb 81f8b6ea4f7729c7 d9c08edfb927813c 89018da903c94cad 119ee6d203a76897 b57426692dfe1bb4 1538ae628434b434 e7c7b12804527eda a170c06c03be9800 0100000000000000

Iteration 7:
Dividend: hi = 000000000000000d, lo = c64b3c07d9b1eff9
q_hat: 000000000000000d, r_hat: c64b3c07d9b1f006
After subtraction, u: 0000000000000000 c64b3c07d9b1eff9 0b12d4fa4412783d 52271c85a443717b 82f686f3b60b36e6 2f79f32f2645f205 c649e4722896157f 83a91ba756aebafb 81f8b6ea4f7729c7 e0408edfb927813c
Stored q[7] = 000000000000000d

Iteration 6:
Dividend: hi = c64b3c07d9b1eff9, lo = 0b12d4fa4412783d
q_hat: c64b3c07d9b1eff9, r_hat: d15e11021dc46836
After subtraction, u: 0000000000000000 0b12d4fa4412783d 52271c85a443717b 82f686f3b60b36e6 2f79f32f2645f205 c649e4722896157f 83a91ba756aebafb 81f8b6ea4f7729c7 e0a3b47dbd145a34 85818da903c94cad
Stored q[6] = c64b3c07d9b1eff9

Iteration 5:
Dividend: hi = 0b12d4fa4412783d, lo = 52271c85a443717b
q_hat: 0b12d4fa4412783d, r_hat: 5d39f17fe855e9b8
After subtraction, u: 0000000000000000 52271c85a443717b 82f686f3b60b36e6 2f79f32f2645f205 c649e4722896157f 83a91ba756aebafb 81f8b6ea4f7729c7 e0a3b47dbd145a34 8587171380eb55e9 301ee6d203a76897
Stored q[5] = 0b12d4fa4412783d

Iteration 4:
Dividend: hi = 52271c85a443717b, lo = 82f686f3b60b36e6
q_hat: 52271c85a443717b, r_hat: d51da3795a4ea861
After subtraction, u: 0000000000000000 82f686f3b60b36e6 2f79f32f2645f205 c649e4722896157f 83a91ba756aebafb 81f8b6ea4f7729c7 e0a3b47dbd145a34 8587171380eb55e9 3047fa6046798a50 72f426692dfe1bb4
Stored q[4] = 52271c85a443717b

Iteration 3:
Dividend: hi = 82f686f3b60b36e6, lo = 2f79f32f2645f205
q_hat: 82f686f3b60b36e6, r_hat: b2707a22dc5128eb
After subtraction, u: 0000000000000000 2f79f32f2645f205 c649e4722896157f 83a91ba756aebafb 81f8b6ea4f7729c7 e0a3b47dbd145a34 8587171380eb55e9 3047fa6046798a50 7335a1aca7d9214f 8838ae628434b434
Stored q[3] = 82f686f3b60b36e6

Iteration 2:
Dividend: hi = 2f79f32f2645f205, lo = c649e4722896157f
q_hat: 2f79f32f2645f205, r_hat: f5c3d7a14edc0784
After subtraction, u: 0000000000000000 c649e4722896157f 83a91ba756aebafb 81f8b6ea4f7729c7 e0a3b47dbd145a34 8587171380eb55e9 3047fa6046798a50 7335a1aca7d9214f 88506b5c1bc7d72d ea47b12804527eda
Stored q[2] = 2f79f32f2645f205

Iteration 1:
Dividend: hi = c649e4722896157f, lo = 83a91ba756aebafb
q_hat: c649e4722896157f, r_hat: 49f300197f44d07a
After subtraction, u: 0000000000000000 83a91ba756aebafb 81f8b6ea4f7729c7 e0a3b47dbd145a34 8587171380eb55e9 3047fa6046798a50 7335a1aca7d9214f 88506b5c1bc7d72d eaaad61a3d66c9e5 60f0c06c03be9800
Stored q[1] = c649e4722896157f

Iteration 0:
Dividend: hi = 83a91ba756aebafb, lo = 81f8b6ea4f7729c7
q_hat: 83a91ba756aebafb, r_hat: 05a1d291a625e4c2
After subtraction, u: 0000000000000000 81f8b6ea4f7729c7 e0a3b47dbd145a34 8587171380eb55e9 3047fa6046798a50 7335a1aca7d9214f 88506b5c1bc7d72d eaaad61a3d66c9e5 613294f9d769ef5d 7e80000000000000
Stored q[0] = 83a91ba756aebafb

Denormalizing u by shifting right 55 bits
Before shift right:
81f8b6ea4f7729c7 e0a3b47dbd145a34 8587171380eb55e9 3047fa6046798a50 7335a1aca7d9214f 88506b5c1bc7d72d eaaad61a3d66c9e5 613294f9d769ef5d 7e80000000000000
After shift right by 55 bits:
0000000000000103 f16dd49eee538fc1 4768fb7a28b4690b 0e2e2701d6abd260 8ff4c08cf314a0e6 6b43594fb2429f10 a0d6b8378fae5bd5 55ac347acd93cac2 6529f3aed3debafd

Final u after denormalization (u_len = 17):
0000000000000103 f16dd49eee538fc1 4768fb7a28b4690b 0e2e2701d6abd260 8ff4c08cf314a0e6 6b43594fb2429f10 a0d6b8378fae5bd5 55ac347acd93cac2 6529f3aed3debafd
MOD function works correctly.

Plně funkční program:

#include <stdint.h>      // For uint64_t, uint8_t, ptrdiff_t
#include <stddef.h>      // For size_t, ptrdiff_t
#include <string.h>      // For memcpy, memset
#include <stdio.h>       // For printf
#include <intrin.h>      // For _umul128, _addcarry_u64, _subborrow_u64, _BitScanReverse64
#include <windows.h>     // For VirtualAlloc, VirtualFree

// Define uint128_t type using struct
typedef struct {
    uint64_t lo;
    uint64_t hi;
} uint128_t;

// Define ssize_t for Visual Studio
typedef ptrdiff_t ssize_t;

// Helper function to perform 128-bit division
void divide_128_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t* q, uint64_t* r) {
    // u1:u0 divided by v, q = quotient, r = remainder
    uint64_t qhat, rhat;
    if (u1 >= v) {
        // Overflow, quotient is maximum representable uint64_t value
        *q = ~(uint64_t)0;
        *r = u1 - v + 1;
        return;
    }
    // Compute qhat = (u1 * 2^64 + u0) / v
    // Note: __udiv128 divides (high:low) by v, returns quotient and remainder
    qhat = _udiv128(u1, u0, v, &rhat);
    *q = qhat;
    *r = rhat;
}

// Function to compare two uint128_t values
int compare_uint128(uint128_t a, uint128_t b) {
    if (a.hi != b.hi) {
        return (a.hi < b.hi) ? -1 : 1;
    }
    else {
        if (a.lo < b.lo) return -1;
        if (a.lo > b.lo) return 1;
        return 0;
    }
}

// Function to compare two big integers represented as arrays (little-endian)
int compare_big_int(uint64_t* a, size_t a_len, uint64_t* b, size_t b_len) {
    // Compare from most significant word to least significant
    ssize_t i;
    for (i = (ssize_t)(a_len > b_len ? a_len : b_len) - 1; i >= 0; --i) {
        uint64_t ai = (i < a_len) ? a[i] : 0;
        uint64_t bi = (i < b_len) ? b[i] : 0;
        if (ai != bi)
            return (ai > bi) ? 1 : -1;
    }
    return 0;
}


// Function to compare two big integers represented as arrays (little-endian)
// Handles different lengths
int compare_big_int_var(uint64_t* a, size_t a_len, uint64_t* b, size_t b_len) {
    size_t max_len = (a_len > b_len) ? a_len : b_len;
    for (ssize_t i = max_len - 1; i >= 0; --i) {
        uint64_t ai = (i < a_len) ? a[i] : 0;
        uint64_t bi = (i < b_len) ? b[i] : 0;
        if (ai != bi) {
            return (ai < bi) ? -1 : 1;
        }
    }
    return 0;
}

// External udiv128 function (you need to implement or provide this)
//extern "C" void udiv128(uint128_t* q, uint128_t* dividend, uint128_t* divisor, uint128_t* r);

// Shift left by 'shift_bits' bits in place (little-endian)
void big_integer_shift_left_in_place(uint64_t* a, size_t len, unsigned long shift_bits) {
    if (shift_bits == 0 || len == 0) return;

    size_t word_shift = shift_bits / 64;
    unsigned long bit_shift = shift_bits % 64;

    printf("Before shift left:\n");
    for (size_t i = len; i-- > 0;) {
        printf("%016llx ", a[i]);
    }
    printf("\n");

    if (word_shift >= len) {
        memset(a, 0, len * sizeof(uint64_t));
        return;
    }

    // Word shift
    if (word_shift > 0) {
        for (ssize_t i = len - 1; i >= 0; --i) {
            if ((size_t)i >= word_shift) {
                a[i] = a[i - word_shift];
            }
            else {
                a[i] = 0;
            }
        }
    }

    // Bit shift
    if (bit_shift > 0) {
        uint64_t carry = 0;
        for (size_t i = 0; i < len; ++i) {
            uint64_t temp = a[i];
            a[i] = (temp << bit_shift) | carry;
            carry = temp >> (64 - bit_shift);
        }
    }

    printf("After shift left by %lu bits:\n", shift_bits);
    for (size_t i = len; i-- > 0;) {
        printf("%016llx ", a[i]);
    }
    printf("\n");
}

// Corrected shift right by 'shift_bits' bits in place (little-endian)
void big_integer_shift_right_in_place(uint64_t* a, size_t len, unsigned long shift_bits) {
    if (shift_bits == 0 || len == 0) return;

    size_t word_shift = shift_bits / 64;
    unsigned long bit_shift = shift_bits % 64;

    printf("Before shift right:\n");
    for (size_t i = len; i-- > 0;) { // From most significant word to least
        printf("%016llx ", a[i]);
    }
    printf("\n");

    if (word_shift >= len) {
        // All bits are shifted out
        memset(a, 0, len * sizeof(uint64_t));
        return;
    }

    // Word shift
    if (word_shift > 0) {
        for (size_t i = 0; i < len - word_shift; ++i) {
            a[i] = a[i + word_shift];
        }
        memset(a + len - word_shift, 0, word_shift * sizeof(uint64_t));
    }

    // Bit shift
    if (bit_shift > 0) {
        uint64_t carry = 0;
        for (ssize_t i = len - 1; i >= 0; --i) { // From most significant word to least
            uint64_t temp = a[i];
            a[i] = (temp >> bit_shift) | (carry << (64 - bit_shift));
            carry = temp & ((1ULL << bit_shift) - 1);
        }
    }

    printf("After shift right by %lu bits:\n", shift_bits);
    for (size_t i = len; i-- > 0;) {
        printf("%016llx ", a[i]);
    }
    printf("\n");
}

// Corrected mod function with detailed debug printouts (little-endian)
void mod(uint64_t* dst, uint64_t* src, uint64_t* tmp, size_t dst_len, size_t src_len) {
    if (dst_len < src_len) {
        // The modulus is dst itself
        return;
    }

    if (src_len == 0) {
        printf("Error: Divisor has zero length.\n");
        return;
    }

    size_t n = dst_len;
    size_t k = src_len;

    // Initialize u
    size_t u_len = n + 1;
    uint64_t* u = tmp; // tmp must have enough space for u_len + k + (n - k + 2)
    memset(u, 0, u_len * sizeof(uint64_t));
    memcpy(u, dst, n * sizeof(uint64_t)); // Copy dst into u[0..n-1]

    // Initialize v
    uint64_t* v = u + u_len; // v has length k
    memcpy(v, src, k * sizeof(uint64_t));

    // Compute the shift amount
    unsigned long shift = 0;
    if (_BitScanReverse64(&shift, v[k - 1])) {
        shift = 63 - shift;
    }
    else {
        printf("Error: Divisor is zero.\n");
        return;
    }

    printf("Shift amount: %lu\n", shift);

    // Normalize v and u
    if (shift > 0) {
        big_integer_shift_left_in_place(v, k, shift);
        big_integer_shift_left_in_place(u, u_len, shift);
    }

    // Debug: Print normalized v
    printf("Normalized v (k = %zu):\n", k);
    for (size_t i = k; i-- > 0;) {
        printf("%016llx ", v[i]);
    }
    printf("\n");

    // Debug: Print normalized u
    printf("Normalized u (u_len = %zu):\n", u_len);
    for (size_t i = u_len; i-- > 0;) {
        printf("%016llx ", u[i]);
    }
    printf("\n");

    // Initialize q
    uint64_t* q = v + k; // q has length n - k + 1
    memset(q, 0, (n - k + 1) * sizeof(uint64_t));

    uint64_t v1 = v[k - 1];
    uint64_t v2 = (k > 1) ? v[k - 2] : 0;

    // Main loop
    for (ssize_t j = n - k; j >= 0; --j) {
        // Estimate q_hat

        uint64_t u_jk = u[j + k];     // u[j + k]
        uint64_t u_jk_1 = u[j + k - 1]; // u[j + k - 1]
        uint64_t u_jk_2 = (j + k - 2 >= 0) ? u[j + k - 2] : 0;

        uint64_t q_hat = 0;
        uint64_t r_hat = 0;

        divide_128_64(u_jk, u_jk_1, v1, &q_hat, &r_hat);

        // Adjust q_hat if necessary
        uint128_t lhs;
        uint128_t rhs;

        while (true) {
            // lhs = q_hat * (v1:v2)
            uint64_t mul_hi, mul_lo;
            mul_lo = _umul128(q_hat, v1, &mul_hi); // q_hat * v1

            lhs.hi = mul_hi;
            lhs.lo = mul_lo;

            // Construct comparison numerator
            uint128_t comp;
            comp.hi = r_hat;
            comp.lo = u_jk_2;

            if (compare_uint128(lhs, comp) <= 0) {
                break;
            }
            q_hat--;
            uint64_t prev_r_hat = r_hat;
            r_hat += v1;
            if (r_hat < prev_r_hat) { // Check for overflow
                break;
            }
        }

        printf("\nIteration %zd:\n", j);
        printf("Dividend: hi = %016llx, lo = %016llx\n", u_jk, u_jk_1);
        printf("q_hat: %016llx, r_hat: %016llx\n", q_hat, r_hat);

        // Multiply and subtract q_hat * v from u[j .. j + k]
        uint8_t borrow = 0;
        uint64_t carry = 0;
        for (size_t i = 0; i < k; ++i) {
            uint64_t prod_hi, prod_lo;
            prod_lo = _umul128(q_hat, v[i], &prod_hi);

            // Add previous carry to prod_lo
            uint64_t sum_lo;
            uint8_t carry_in = _addcarry_u64(0, prod_lo, carry, &sum_lo);

            // Update carry
            uint64_t sum_hi = prod_hi + carry_in;

            // Subtract sum_lo from u[j + i]
            uint64_t temp;
            borrow = _subborrow_u64(borrow, u[j + i], sum_lo, &temp);
            u[j + i] = temp;

            carry = sum_hi;
        }
        // Subtract the final carry
        uint64_t temp;
        borrow = _subborrow_u64(borrow, u[j + k], carry, &temp);
        u[j + k] = temp;

        // After subtraction, u:
        printf("After subtraction, u:");
        for (ssize_t i = j + k; i >= j; --i) {
            printf(" %016llx", u[i]);
        }
        printf("\n");

        // Correction if borrow occurred
        if (borrow) {
            printf("Borrow occurred, correcting u and q_hat\n");
            q_hat--;
            uint8_t carry2 = 0;
            for (size_t i = 0; i < k; ++i) {
                uint64_t sum;
                carry2 = _addcarry_u64(carry2, u[j + i], v[i], &sum);
                u[j + i] = sum;
            }
            // Add carry to u[j + k]
            uint64_t sum;
            carry2 = _addcarry_u64(carry2, u[j + k], 0, &sum);
            u[j + k] = sum;

            // After correction, u:
            printf("After correction, u:");
            for (ssize_t i = j + k; i >= j; --i) {
                printf(" %016llx", u[i]);
            }
            printf("\n");
        }

        q[j] = q_hat;
        printf("Stored q[%zd] = %016llx\n", j, q_hat);
    }

    // Denormalize
    if (shift > 0) {
        printf("\nDenormalizing u by shifting right %lu bits\n", shift);
        big_integer_shift_right_in_place(u, k, shift);
    }

    // Debug: Print final u
    printf("\nFinal u after denormalization (u_len = %zu):\n", u_len);
    for (size_t i = k; i-- > 0;) {
        printf("%016llx ", u[i]);
    }
    printf("\n");

    // The modulus is in u[0..k - 1]
    memcpy(dst, u, k * sizeof(uint64_t));
    memset(dst + k, 0, (dst_len - k) * sizeof(uint64_t));

    // Ensure the modulus is less than the divisor
    if (compare_big_int(dst, k, src, k) >= 0) {
        printf("Final subtraction since remainder >= divisor\n");
        uint8_t borrow = 0;
        for (size_t i = 0; i < k; ++i) {
            uint64_t temp;
            borrow = _subborrow_u64(borrow, dst[i], src[i], &temp);
            dst[i] = temp;
        }
    }
}

int main() {
    const size_t size = 1024 * 1024; // Adjust size as needed

    uint64_t* POLE = (uint64_t*)VirtualAlloc(NULL, size * sizeof(uint64_t), MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);

    if (!POLE) {
        printf("Memory allocation failed.\n");
        return -1;
    }

    size_t dst_len = 16;
    size_t src_len = 9;

    uint64_t* dst = POLE;
    uint64_t* src = dst + dst_len;
    uint64_t* expected_modulus = src + src_len;
    uint64_t* tmp = expected_modulus + src_len;

    // Initialize dst (dividend) - little-endian
    uint64_t dst_init[16] = {
        0xe180d8077d300002ULL, 0x8f625008a4fdb542ULL, 0x715cc508696869cfULL, 0xe84cd25bfc37682aULL,
        0x3dcda4074ed12f6aULL, 0x031b520792995a23ULL, 0x811dbf724f027912ULL, 0xf16dd49eee538fb3ULL,
        0x52374ead5d75f703ULL, 0x93c8e4512c2aff07ULL, 0xf3e65e4c8be40b8cULL, 0xed0de76c166dcc5eULL,
        0x4e390b4886e2f705ULL, 0x25a9f48824f07aa4ULL, 0x96780fb363dff216ULL, 0x0000000000001b8cULL
    };

    memcpy(dst, dst_init, dst_len * sizeof(uint64_t));

    // Initialize src (divisor) - little-endian
    uint64_t src_init[9] = {
        0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
        0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
        0x00000000000001ffULL
    };

    memcpy(src, src_init, src_len * sizeof(uint64_t));

    // Initialize expected modulus - little-endian
    uint64_t expected_modulus_init[9] = {
        0x6529f3aed3debafdULL, 0x55ac347acd93cac2ULL, 0xa0d6b8378fae5bd5ULL, 0x6b43594fb2429f10ULL,
        0x8ff4c08cf314a0e6ULL, 0x0e2e2701d6abd260ULL, 0x4768fb7a28b4690bULL, 0xf16dd49eee538fc1ULL,
        0x0000000000000103ULL
    };

    memcpy(expected_modulus, expected_modulus_init, src_len * sizeof(uint64_t));

    // Call mod function
    mod(dst, src, tmp, dst_len, src_len);

    // Verify result
    if (memcmp(dst, expected_modulus, src_len * sizeof(uint64_t)) == 0) {
        printf("MOD function works correctly.\n");
    }
    else {
        printf("MOD function failed.\n");
        printf("Expected:\n");
        for (size_t i = src_len; i-- > 0;) {
            printf("%016llx ", expected_modulus[i]);
        }
        printf("\nActual:\n");
        for (size_t i = src_len; i-- > 0;) {
            printf("%016llx ", dst[i]);
        }
        printf("\n");
    }

    VirtualFree(POLE, 0, MEM_RELEASE);

    return 0;
}

Akceptované řešení
+5 Zkušeností

Nahoru Odpovědět

11.1.2025 10:25

Naučíme tě pracovat na home-office.

Zjistit více...

Děláme co je v našich silách, aby byly zdejší diskuze co nejkvalitnější. Proto do nich také mohou přispívat pouze registrovaní členové. Pro zapojení do diskuze se přihlas. Pokud ještě nemáš účet, zaregistruj se, je to zdarma.

Zobrazeno 13 zpráv z 13.

Nejčastěji vyhledáváné

Diskuze: MOD velkého unsigned integer čísla