From 6971934d9bb4b8176b48658482862169a4582913 Mon Sep 17 00:00:00 2001 From: Ezekiel Newren Date: Tue, 18 Nov 2025 22:34:13 +0000 Subject: [PATCH 01/10] doc: define unambiguous type mappings across C and Rust Document other nuances when crossing the FFI boundary. Other language mappings may be added in the future. Signed-off-by: Ezekiel Newren Signed-off-by: Junio C Hamano --- Documentation/Makefile | 1 + Documentation/technical/meson.build | 1 + .../technical/unambiguous-types.adoc | 224 ++++++++++++++++++ 3 files changed, 226 insertions(+) create mode 100644 Documentation/technical/unambiguous-types.adoc diff --git a/Documentation/Makefile b/Documentation/Makefile index a3fbd29744..b580bdc98b 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -140,6 +140,7 @@ TECH_DOCS += technical/shallow TECH_DOCS += technical/sparse-checkout TECH_DOCS += technical/sparse-index TECH_DOCS += technical/trivial-merge +TECH_DOCS += technical/unambiguous-types TECH_DOCS += technical/unit-tests SP_ARTICLES += $(TECH_DOCS) SP_ARTICLES += technical/api-index diff --git a/Documentation/technical/meson.build b/Documentation/technical/meson.build index 858af811a7..7df9b8acf6 100644 --- a/Documentation/technical/meson.build +++ b/Documentation/technical/meson.build @@ -31,6 +31,7 @@ articles = [ 'sparse-checkout.adoc', 'sparse-index.adoc', 'trivial-merge.adoc', + 'unambiguous-types.adoc', 'unit-tests.adoc', ] diff --git a/Documentation/technical/unambiguous-types.adoc b/Documentation/technical/unambiguous-types.adoc new file mode 100644 index 0000000000..9a4990847c --- /dev/null +++ b/Documentation/technical/unambiguous-types.adoc @@ -0,0 +1,224 @@ += Unambiguous types + +Most of these mappings are obvious, but there are some nuances and gotchas with +Rust FFI (Foreign Function Interface). + +This document defines clear, one-to-one mappings between primitive types in C, +Rust (and possible other languages in the future). Its purpose is to eliminate +ambiguity in type widths, signedness, and binary representation across +platforms and languages. + +For Git, the only header required to use these unambiguous types in C is +`git-compat-util.h`. + +== Boolean types +[cols="1,1", options="header"] +|=== +| C Type | Rust Type +| bool^1^ | bool +|=== + +== Integer types + +In C, `` (or an equivalent) must be included. + +[cols="1,1", options="header"] +|=== +| C Type | Rust Type +| uint8_t | u8 +| uint16_t | u16 +| uint32_t | u32 +| uint64_t | u64 + +| int8_t | i8 +| int16_t | i16 +| int32_t | i32 +| int64_t | i64 +|=== + +== Floating-point types + +Rust requires IEEE-754 semantics. +In C, that is typically true, but not guaranteed by the standard. + +[cols="1,1", options="header"] +|=== +| C Type | Rust Type +| float^2^ | f32 +| double^2^ | f64 +|=== + +== Size types + +These types represent pointer-sized integers and are typically defined in +`` or an equivalent header. + +Size types should be used any time pointer arithmetic is performed e.g. +indexing an array, describing the number of elements in memory, etc... + +[cols="1,1", options="header"] +|=== +| C Type | Rust Type +| size_t^3^ | usize +| ptrdiff_t^3^ | isize +|=== + +== Character types + +This is where C and Rust don't have a clean one-to-one mapping. + +A C `char` and a Rust `u8` share the same bit width, so any C struct containing +a `char` will have the same size as the corresponding Rust struct using `u8`. +In that sense, such structs are safe to pass over the FFI boundary, because +their fields will be laid out identically. However, beyond bit width, C `char` +has additional semantics and platform-dependent behavior that can cause +problems, as discussed below. + +The C language leaves the signedness of `char` implementation defined. Because +our developer build enables -Wsign-compare, comparison of a value of `char` +type with either signed or unsigned integers may trigger warnings from the +compiler. + +Note: Rust's `char` type is an unsigned 32-bit integer that is used to describe +Unicode code points. + +=== Notes +^1^ This is only true if stdbool.h (or equivalent) is used. + +^2^ C does not enforce IEEE-754 compatibility, but Rust expects it. If the +platform/arch for C does not follow IEEE-754 then this equivalence does not +hold. Also, it's assumed that `float` is 32 bits and `double` is 64, but +there may be a strange platform/arch where even this isn't true. + +^3^ C also defines uintptr_t, ssize_t and intptr_t, but these types are +discouraged for FFI purposes. For functions like `read()` and `write()` ssize_t +should be cast to a different, and unambiguous, type before being passed over +the FFI boundary. + + +== Problems with std::ffi::c_* types in Rust +TL;DR: In practice, Rust's `c_*` types aren't guaranteed to match C types for +all possible C compilers, platforms, or architectures, because Rust only +ensures correctness of C types on officially supported targets. These +definitions have changed over time to match more targets which means that the +c_* definitions will differ based on which Rust version Git chooses to use. + +Current list of safe, Rust side, FFI types in Git: + + +* `c_void` +* `CStr` +* `CString` + +Even then, they should be used sparingly, and only where the semantics match +exactly. + +The std::os::raw::c_* directly inherits the problems of core::ffi, which +changes over time and seems to make a best guess at the correct definition for +a given platform/target. This probably isn't a problem for all other platforms +that Rust supports currently, but can anyone say that Rust got it right for all +C compilers of all platforms/targets? + +To give an example: c_long is defined in +footnote:[https://doc.rust-lang.org/1.63.0/src/core/ffi/mod.rs.html#175-189[c_long in 1.63.0]] +footnote:[https://doc.rust-lang.org/1.89.0/src/core/ffi/primitives.rs.html#135-151[c_long in 1.89.0]] + +=== Rust version 1.63.0 + +``` +mod c_long_definition { + cfg_if! { + if #[cfg(all(target_pointer_width = "64", not(windows)))] { + pub type c_long = i64; + pub type NonZero_c_long = crate::num::NonZeroI64; + pub type c_ulong = u64; + pub type NonZero_c_ulong = crate::num::NonZeroU64; + } else { + // The minimal size of `long` in the C standard is 32 bits + pub type c_long = i32; + pub type NonZero_c_long = crate::num::NonZeroI32; + pub type c_ulong = u32; + pub type NonZero_c_ulong = crate::num::NonZeroU32; + } + } +} +``` + +=== Rust version 1.89.0 + +``` +mod c_long_definition { + crate::cfg_select! { + any( + all(target_pointer_width = "64", not(windows)), + // wasm32 Linux ABI uses 64-bit long + all(target_arch = "wasm32", target_os = "linux") + ) => { + pub(super) type c_long = i64; + pub(super) type c_ulong = u64; + } + _ => { + // The minimal size of `long` in the C standard is 32 bits + pub(super) type c_long = i32; + pub(super) type c_ulong = u32; + } + } +} +``` + +Even for the cases where C types are correctly mapped to Rust types via +std::ffi::c_* there are still problems. Let's take c_char for example. On some +platforms it's u8 on others it's i8. + +=== Subtraction underflow in debug mode + +The following code will panic in debug on platforms that define c_char as u8, +but won't if it's an i8. + +``` +let mut x: std::ffi::c_char = 0; +x -= 1; +``` + +=== Inconsistent shift behavior + +`x` will be 0xC0 for platforms that use i8, but will be 0x40 where it's u8. + +``` +let mut x: std::ffi::c_char = 0x80; +x >>= 1; +``` + +=== Equality fails to compile on some platforms + +The following will not compile on platforms that define c_char as i8, but will +if it's u8. You can cast x e.g. `assert_eq!(x as u8, b'a');`, but then you get +a warning on platforms that use u8 and a clean compilation where i8 is used. + +``` +let mut x: std::ffi::c_char = 0x61; +assert_eq!(x, b'a'); +``` + +== Enum types +Rust enum types should not be used as FFI types. Rust enum types are more like +C union types than C enum's. For something like: + +``` +#[repr(C, u8)] +enum Fruit { + Apple, + Banana, + Cherry, +} +``` + +It's easy enough to make sure the Rust enum matches what C would expect, but a +more complex type like. + +``` +enum HashResult { + SHA1([u8; 20]), + SHA256([u8; 32]), +} +``` + +The Rust compiler has to add a discriminant to the enum to distinguish between +the variants. The width, location, and values for that discriminant is up to +the Rust compiler and is not ABI stable. From f007f4f4b473565fb2e94780028399030926bacb Mon Sep 17 00:00:00 2001 From: Ezekiel Newren Date: Tue, 18 Nov 2025 22:34:14 +0000 Subject: [PATCH 02/10] xdiff: use ptrdiff_t for dstart/dend ptrdiff_t is appropriate for dstart and dend because they both describe positive or negative offsets relative to a pointer. Signed-off-by: Ezekiel Newren Signed-off-by: Junio C Hamano --- xdiff/xtypes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xdiff/xtypes.h b/xdiff/xtypes.h index f145abba3e..7a2d429ec5 100644 --- a/xdiff/xtypes.h +++ b/xdiff/xtypes.h @@ -47,7 +47,7 @@ typedef struct s_xrecord { typedef struct s_xdfile { xrecord_t *recs; long nrec; - long dstart, dend; + ptrdiff_t dstart, dend; bool *changed; long *rindex; long nreff; From 10f97d6affcb59bdcb74a6878d3d9da0eec81296 Mon Sep 17 00:00:00 2001 From: Ezekiel Newren Date: Tue, 18 Nov 2025 22:34:15 +0000 Subject: [PATCH 03/10] xdiff: make xrecord_t.ptr a uint8_t instead of char Make xrecord_t.ptr uint8_t because it's referring to bytes in memory. In order to avoid a refactor avalanche, many uses of this field were cast to char* or similar. Places where casting was unnecessary: xemit.c:156 xmerge.c:124 xmerge.c:127 xmerge.c:164 xmerge.c:169 xmerge.c:172 xmerge.c:178 Signed-off-by: Ezekiel Newren Signed-off-by: Junio C Hamano --- xdiff/xdiffi.c | 8 ++++---- xdiff/xemit.c | 6 +++--- xdiff/xmerge.c | 14 +++++++------- xdiff/xpatience.c | 2 +- xdiff/xprepare.c | 6 +++--- xdiff/xtypes.h | 2 +- xdiff/xutils.c | 4 ++-- 7 files changed, 21 insertions(+), 21 deletions(-) diff --git a/xdiff/xdiffi.c b/xdiff/xdiffi.c index 6f3998ee54..95989b6af1 100644 --- a/xdiff/xdiffi.c +++ b/xdiff/xdiffi.c @@ -407,7 +407,7 @@ static int get_indent(xrecord_t *rec) int ret = 0; for (i = 0; i < rec->size; i++) { - char c = rec->ptr[i]; + char c = (char) rec->ptr[i]; if (!XDL_ISSPACE(c)) return ret; @@ -993,11 +993,11 @@ static void xdl_mark_ignorable_lines(xdchange_t *xscr, xdfenv_t *xe, long flags) rec = &xe->xdf1.recs[xch->i1]; for (i = 0; i < xch->chg1 && ignore; i++) - ignore = xdl_blankline(rec[i].ptr, rec[i].size, flags); + ignore = xdl_blankline((const char *)rec[i].ptr, rec[i].size, flags); rec = &xe->xdf2.recs[xch->i2]; for (i = 0; i < xch->chg2 && ignore; i++) - ignore = xdl_blankline(rec[i].ptr, rec[i].size, flags); + ignore = xdl_blankline((const char *)rec[i].ptr, rec[i].size, flags); xch->ignore = ignore; } @@ -1008,7 +1008,7 @@ static int record_matches_regex(xrecord_t *rec, xpparam_t const *xpp) { size_t i; for (i = 0; i < xpp->ignore_regex_nr; i++) - if (!regexec_buf(xpp->ignore_regex[i], rec->ptr, rec->size, 1, + if (!regexec_buf(xpp->ignore_regex[i], (const char *)rec->ptr, rec->size, 1, ®match, 0)) return 1; diff --git a/xdiff/xemit.c b/xdiff/xemit.c index b2f1f30cd3..ead930088a 100644 --- a/xdiff/xemit.c +++ b/xdiff/xemit.c @@ -27,7 +27,7 @@ static int xdl_emit_record(xdfile_t *xdf, long ri, char const *pre, xdemitcb_t * { xrecord_t *rec = &xdf->recs[ri]; - if (xdl_emit_diffrec(rec->ptr, rec->size, pre, strlen(pre), ecb) < 0) + if (xdl_emit_diffrec((char const *)rec->ptr, rec->size, pre, strlen(pre), ecb) < 0) return -1; return 0; @@ -113,8 +113,8 @@ static long match_func_rec(xdfile_t *xdf, xdemitconf_t const *xecfg, long ri, xrecord_t *rec = &xdf->recs[ri]; if (!xecfg->find_func) - return def_ff(rec->ptr, rec->size, buf, sz); - return xecfg->find_func(rec->ptr, rec->size, buf, sz, xecfg->find_func_priv); + return def_ff((const char *)rec->ptr, rec->size, buf, sz); + return xecfg->find_func((const char *)rec->ptr, rec->size, buf, sz, xecfg->find_func_priv); } static int is_func_rec(xdfile_t *xdf, xdemitconf_t const *xecfg, long ri) diff --git a/xdiff/xmerge.c b/xdiff/xmerge.c index fd600cbb5d..75cb3e76a2 100644 --- a/xdiff/xmerge.c +++ b/xdiff/xmerge.c @@ -101,8 +101,8 @@ static int xdl_merge_cmp_lines(xdfenv_t *xe1, int i1, xdfenv_t *xe2, int i2, xrecord_t *rec2 = xe2->xdf2.recs + i2; for (i = 0; i < line_count; i++) { - int result = xdl_recmatch(rec1[i].ptr, rec1[i].size, - rec2[i].ptr, rec2[i].size, flags); + int result = xdl_recmatch((const char *)rec1[i].ptr, rec1[i].size, + (const char *)rec2[i].ptr, rec2[i].size, flags); if (!result) return -1; } @@ -324,8 +324,8 @@ static int xdl_fill_merge_buffer(xdfenv_t *xe1, const char *name1, static int recmatch(xrecord_t *rec1, xrecord_t *rec2, unsigned long flags) { - return xdl_recmatch(rec1->ptr, rec1->size, - rec2->ptr, rec2->size, flags); + return xdl_recmatch((const char *)rec1->ptr, rec1->size, + (const char *)rec2->ptr, rec2->size, flags); } /* @@ -382,10 +382,10 @@ static int xdl_refine_conflicts(xdfenv_t *xe1, xdfenv_t *xe2, xdmerge_t *m, * we have a very simple mmfile structure. */ t1.ptr = (char *)xe1->xdf2.recs[m->i1].ptr; - t1.size = xe1->xdf2.recs[m->i1 + m->chg1 - 1].ptr + t1.size = (char *)xe1->xdf2.recs[m->i1 + m->chg1 - 1].ptr + xe1->xdf2.recs[m->i1 + m->chg1 - 1].size - t1.ptr; t2.ptr = (char *)xe2->xdf2.recs[m->i2].ptr; - t2.size = xe2->xdf2.recs[m->i2 + m->chg2 - 1].ptr + t2.size = (char *)xe2->xdf2.recs[m->i2 + m->chg2 - 1].ptr + xe2->xdf2.recs[m->i2 + m->chg2 - 1].size - t2.ptr; if (xdl_do_diff(&t1, &t2, xpp, &xe) < 0) return -1; @@ -440,7 +440,7 @@ static int line_contains_alnum(const char *ptr, long size) static int lines_contain_alnum(xdfenv_t *xe, int i, int chg) { for (; chg; chg--, i++) - if (line_contains_alnum(xe->xdf2.recs[i].ptr, + if (line_contains_alnum((const char *)xe->xdf2.recs[i].ptr, xe->xdf2.recs[i].size)) return 1; return 0; diff --git a/xdiff/xpatience.c b/xdiff/xpatience.c index 669b653580..bb61354f22 100644 --- a/xdiff/xpatience.c +++ b/xdiff/xpatience.c @@ -121,7 +121,7 @@ static void insert_record(xpparam_t const *xpp, int line, struct hashmap *map, return; map->entries[index].line1 = line; map->entries[index].hash = record->ha; - map->entries[index].anchor = is_anchor(xpp, map->env->xdf1.recs[line - 1].ptr); + map->entries[index].anchor = is_anchor(xpp, (const char *)map->env->xdf1.recs[line - 1].ptr); if (!map->first) map->first = map->entries + index; if (map->last) { diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c index 192334f1b7..4c56467076 100644 --- a/xdiff/xprepare.c +++ b/xdiff/xprepare.c @@ -99,8 +99,8 @@ static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t hi = (long) XDL_HASHLONG(rec->ha, cf->hbits); for (rcrec = cf->rchash[hi]; rcrec; rcrec = rcrec->next) if (rcrec->rec.ha == rec->ha && - xdl_recmatch(rcrec->rec.ptr, rcrec->rec.size, - rec->ptr, rec->size, cf->flags)) + xdl_recmatch((const char *)rcrec->rec.ptr, rcrec->rec.size, + (const char *)rec->ptr, rec->size, cf->flags)) break; if (!rcrec) { @@ -156,7 +156,7 @@ static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_ if (XDL_ALLOC_GROW(xdf->recs, xdf->nrec + 1, narec)) goto abort; crec = &xdf->recs[xdf->nrec++]; - crec->ptr = prev; + crec->ptr = (uint8_t const *)prev; crec->size = (long) (cur - prev); crec->ha = hav; if (xdl_classify_record(pass, cf, crec) < 0) diff --git a/xdiff/xtypes.h b/xdiff/xtypes.h index 7a2d429ec5..69727fb299 100644 --- a/xdiff/xtypes.h +++ b/xdiff/xtypes.h @@ -39,7 +39,7 @@ typedef struct s_chastore { } chastore_t; typedef struct s_xrecord { - char const *ptr; + uint8_t const *ptr; long size; unsigned long ha; } xrecord_t; diff --git a/xdiff/xutils.c b/xdiff/xutils.c index 447e66c719..7be063bfb6 100644 --- a/xdiff/xutils.c +++ b/xdiff/xutils.c @@ -465,10 +465,10 @@ int xdl_fall_back_diff(xdfenv_t *diff_env, xpparam_t const *xpp, xdfenv_t env; subfile1.ptr = (char *)diff_env->xdf1.recs[line1 - 1].ptr; - subfile1.size = diff_env->xdf1.recs[line1 + count1 - 2].ptr + + subfile1.size = (char *)diff_env->xdf1.recs[line1 + count1 - 2].ptr + diff_env->xdf1.recs[line1 + count1 - 2].size - subfile1.ptr; subfile2.ptr = (char *)diff_env->xdf2.recs[line2 - 1].ptr; - subfile2.size = diff_env->xdf2.recs[line2 + count2 - 2].ptr + + subfile2.size = (char *)diff_env->xdf2.recs[line2 + count2 - 2].ptr + diff_env->xdf2.recs[line2 + count2 - 2].size - subfile2.ptr; if (xdl_do_diff(&subfile1, &subfile2, xpp, &env) < 0) return -1; From 9bd193253c5e590203fc566ad7cff8f891ec0493 Mon Sep 17 00:00:00 2001 From: Ezekiel Newren Date: Tue, 18 Nov 2025 22:34:16 +0000 Subject: [PATCH 04/10] xdiff: use size_t for xrecord_t.size size_t is the appropriate type because size is describing the number of elements, bytes in this case, in memory. Signed-off-by: Ezekiel Newren Signed-off-by: Junio C Hamano --- xdiff/xdiffi.c | 7 +++---- xdiff/xemit.c | 8 ++++---- xdiff/xmerge.c | 16 ++++++++-------- xdiff/xprepare.c | 6 +++--- xdiff/xtypes.h | 2 +- 5 files changed, 19 insertions(+), 20 deletions(-) diff --git a/xdiff/xdiffi.c b/xdiff/xdiffi.c index 95989b6af1..cb8e412c7b 100644 --- a/xdiff/xdiffi.c +++ b/xdiff/xdiffi.c @@ -403,10 +403,9 @@ static int recs_match(xrecord_t *rec1, xrecord_t *rec2) */ static int get_indent(xrecord_t *rec) { - long i; int ret = 0; - for (i = 0; i < rec->size; i++) { + for (size_t i = 0; i < rec->size; i++) { char c = (char) rec->ptr[i]; if (!XDL_ISSPACE(c)) @@ -993,11 +992,11 @@ static void xdl_mark_ignorable_lines(xdchange_t *xscr, xdfenv_t *xe, long flags) rec = &xe->xdf1.recs[xch->i1]; for (i = 0; i < xch->chg1 && ignore; i++) - ignore = xdl_blankline((const char *)rec[i].ptr, rec[i].size, flags); + ignore = xdl_blankline((const char *)rec[i].ptr, (long)rec[i].size, flags); rec = &xe->xdf2.recs[xch->i2]; for (i = 0; i < xch->chg2 && ignore; i++) - ignore = xdl_blankline((const char *)rec[i].ptr, rec[i].size, flags); + ignore = xdl_blankline((const char *)rec[i].ptr, (long)rec[i].size, flags); xch->ignore = ignore; } diff --git a/xdiff/xemit.c b/xdiff/xemit.c index ead930088a..2f8007753c 100644 --- a/xdiff/xemit.c +++ b/xdiff/xemit.c @@ -27,7 +27,7 @@ static int xdl_emit_record(xdfile_t *xdf, long ri, char const *pre, xdemitcb_t * { xrecord_t *rec = &xdf->recs[ri]; - if (xdl_emit_diffrec((char const *)rec->ptr, rec->size, pre, strlen(pre), ecb) < 0) + if (xdl_emit_diffrec((char const *)rec->ptr, (long)rec->size, pre, strlen(pre), ecb) < 0) return -1; return 0; @@ -113,8 +113,8 @@ static long match_func_rec(xdfile_t *xdf, xdemitconf_t const *xecfg, long ri, xrecord_t *rec = &xdf->recs[ri]; if (!xecfg->find_func) - return def_ff((const char *)rec->ptr, rec->size, buf, sz); - return xecfg->find_func((const char *)rec->ptr, rec->size, buf, sz, xecfg->find_func_priv); + return def_ff((const char *)rec->ptr, (long)rec->size, buf, sz); + return xecfg->find_func((const char *)rec->ptr, (long)rec->size, buf, sz, xecfg->find_func_priv); } static int is_func_rec(xdfile_t *xdf, xdemitconf_t const *xecfg, long ri) @@ -151,7 +151,7 @@ static long get_func_line(xdfenv_t *xe, xdemitconf_t const *xecfg, static int is_empty_rec(xdfile_t *xdf, long ri) { xrecord_t *rec = &xdf->recs[ri]; - long i = 0; + size_t i = 0; for (; i < rec->size && XDL_ISSPACE(rec->ptr[i]); i++); diff --git a/xdiff/xmerge.c b/xdiff/xmerge.c index 75cb3e76a2..0dd4558a32 100644 --- a/xdiff/xmerge.c +++ b/xdiff/xmerge.c @@ -101,8 +101,8 @@ static int xdl_merge_cmp_lines(xdfenv_t *xe1, int i1, xdfenv_t *xe2, int i2, xrecord_t *rec2 = xe2->xdf2.recs + i2; for (i = 0; i < line_count; i++) { - int result = xdl_recmatch((const char *)rec1[i].ptr, rec1[i].size, - (const char *)rec2[i].ptr, rec2[i].size, flags); + int result = xdl_recmatch((const char *)rec1[i].ptr, (long)rec1[i].size, + (const char *)rec2[i].ptr, (long)rec2[i].size, flags); if (!result) return -1; } @@ -119,11 +119,11 @@ static int xdl_recs_copy_0(int use_orig, xdfenv_t *xe, int i, int count, int nee if (count < 1) return 0; - for (i = 0; i < count; size += recs[i++].size) + for (i = 0; i < count; size += (int)recs[i++].size) if (dest) memcpy(dest + size, recs[i].ptr, recs[i].size); if (add_nl) { - i = recs[count - 1].size; + i = (int)recs[count - 1].size; if (i == 0 || recs[count - 1].ptr[i - 1] != '\n') { if (needs_cr) { if (dest) @@ -156,7 +156,7 @@ static int xdl_orig_copy(xdfenv_t *xe, int i, int count, int needs_cr, int add_n */ static int is_eol_crlf(xdfile_t *file, int i) { - long size; + size_t size; if (i < file->nrec - 1) /* All lines before the last *must* end in LF */ @@ -324,8 +324,8 @@ static int xdl_fill_merge_buffer(xdfenv_t *xe1, const char *name1, static int recmatch(xrecord_t *rec1, xrecord_t *rec2, unsigned long flags) { - return xdl_recmatch((const char *)rec1->ptr, rec1->size, - (const char *)rec2->ptr, rec2->size, flags); + return xdl_recmatch((const char *)rec1->ptr, (long)rec1->size, + (const char *)rec2->ptr, (long)rec2->size, flags); } /* @@ -441,7 +441,7 @@ static int lines_contain_alnum(xdfenv_t *xe, int i, int chg) { for (; chg; chg--, i++) if (line_contains_alnum((const char *)xe->xdf2.recs[i].ptr, - xe->xdf2.recs[i].size)) + (long)xe->xdf2.recs[i].size)) return 1; return 0; } diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c index 4c56467076..b3219aed3e 100644 --- a/xdiff/xprepare.c +++ b/xdiff/xprepare.c @@ -99,8 +99,8 @@ static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t hi = (long) XDL_HASHLONG(rec->ha, cf->hbits); for (rcrec = cf->rchash[hi]; rcrec; rcrec = rcrec->next) if (rcrec->rec.ha == rec->ha && - xdl_recmatch((const char *)rcrec->rec.ptr, rcrec->rec.size, - (const char *)rec->ptr, rec->size, cf->flags)) + xdl_recmatch((const char *)rcrec->rec.ptr, (long)rcrec->rec.size, + (const char *)rec->ptr, (long)rec->size, cf->flags)) break; if (!rcrec) { @@ -157,7 +157,7 @@ static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_ goto abort; crec = &xdf->recs[xdf->nrec++]; crec->ptr = (uint8_t const *)prev; - crec->size = (long) (cur - prev); + crec->size = cur - prev; crec->ha = hav; if (xdl_classify_record(pass, cf, crec) < 0) goto abort; diff --git a/xdiff/xtypes.h b/xdiff/xtypes.h index 69727fb299..354349b523 100644 --- a/xdiff/xtypes.h +++ b/xdiff/xtypes.h @@ -40,7 +40,7 @@ typedef struct s_chastore { typedef struct s_xrecord { uint8_t const *ptr; - long size; + size_t size; unsigned long ha; } xrecord_t; From b0d4ae30f5a23fa9da87e9396b78e6442b351ddc Mon Sep 17 00:00:00 2001 From: Ezekiel Newren Date: Tue, 18 Nov 2025 22:34:17 +0000 Subject: [PATCH 05/10] xdiff: use unambiguous types in xdl_hash_record() Convert the function signature and body to use unambiguous types. char is changed to uint8_t because this function processes bytes in memory. unsigned long to uint64_t so that the hash output is consistent across platforms. `flags` was changed from long to uint64_t to ensure the high order bits are not dropped on platforms that treat long as 32 bits. Signed-off-by: Ezekiel Newren Signed-off-by: Junio C Hamano --- xdiff-interface.c | 2 +- xdiff/xprepare.c | 6 +++--- xdiff/xutils.c | 28 ++++++++++++++-------------- xdiff/xutils.h | 6 +++--- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/xdiff-interface.c b/xdiff-interface.c index 4971f722b3..1a35556380 100644 --- a/xdiff-interface.c +++ b/xdiff-interface.c @@ -300,7 +300,7 @@ void xdiff_clear_find_func(xdemitconf_t *xecfg) unsigned long xdiff_hash_string(const char *s, size_t len, long flags) { - return xdl_hash_record(&s, s + len, flags); + return xdl_hash_record((uint8_t const**)&s, (uint8_t const*)s + len, flags); } int xdiff_compare_lines(const char *l1, long s1, diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c index b3219aed3e..85e56021da 100644 --- a/xdiff/xprepare.c +++ b/xdiff/xprepare.c @@ -137,8 +137,8 @@ static void xdl_free_ctx(xdfile_t *xdf) static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_t const *xpp, xdlclassifier_t *cf, xdfile_t *xdf) { long bsize; - unsigned long hav; - char const *blk, *cur, *top, *prev; + uint64_t hav; + uint8_t const *blk, *cur, *top, *prev; xrecord_t *crec; xdf->rindex = NULL; @@ -156,7 +156,7 @@ static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_ if (XDL_ALLOC_GROW(xdf->recs, xdf->nrec + 1, narec)) goto abort; crec = &xdf->recs[xdf->nrec++]; - crec->ptr = (uint8_t const *)prev; + crec->ptr = prev; crec->size = cur - prev; crec->ha = hav; if (xdl_classify_record(pass, cf, crec) < 0) diff --git a/xdiff/xutils.c b/xdiff/xutils.c index 7be063bfb6..77ee1ad9c8 100644 --- a/xdiff/xutils.c +++ b/xdiff/xutils.c @@ -249,11 +249,11 @@ int xdl_recmatch(const char *l1, long s1, const char *l2, long s2, long flags) return 1; } -unsigned long xdl_hash_record_with_whitespace(char const **data, - char const *top, long flags) { - unsigned long ha = 5381; - char const *ptr = *data; - int cr_at_eol_only = (flags & XDF_WHITESPACE_FLAGS) == XDF_IGNORE_CR_AT_EOL; +uint64_t xdl_hash_record_with_whitespace(uint8_t const **data, + uint8_t const *top, uint64_t flags) { + uint64_t ha = 5381; + uint8_t const *ptr = *data; + bool cr_at_eol_only = (flags & XDF_WHITESPACE_FLAGS) == XDF_IGNORE_CR_AT_EOL; for (; ptr < top && *ptr != '\n'; ptr++) { if (cr_at_eol_only) { @@ -263,8 +263,8 @@ unsigned long xdl_hash_record_with_whitespace(char const **data, continue; } else if (XDL_ISSPACE(*ptr)) { - const char *ptr2 = ptr; - int at_eol; + const uint8_t *ptr2 = ptr; + bool at_eol; while (ptr + 1 < top && XDL_ISSPACE(ptr[1]) && ptr[1] != '\n') ptr++; @@ -274,20 +274,20 @@ unsigned long xdl_hash_record_with_whitespace(char const **data, else if (flags & XDF_IGNORE_WHITESPACE_CHANGE && !at_eol) { ha += (ha << 5); - ha ^= (unsigned long) ' '; + ha ^= (uint64_t) ' '; } else if (flags & XDF_IGNORE_WHITESPACE_AT_EOL && !at_eol) { while (ptr2 != ptr + 1) { ha += (ha << 5); - ha ^= (unsigned long) *ptr2; + ha ^= (uint64_t) *ptr2; ptr2++; } } continue; } ha += (ha << 5); - ha ^= (unsigned long) *ptr; + ha ^= (uint64_t) *ptr; } *data = ptr < top ? ptr + 1: ptr; @@ -304,9 +304,9 @@ unsigned long xdl_hash_record_with_whitespace(char const **data, #define REASSOC_FENCE(x, y) #endif -unsigned long xdl_hash_record_verbatim(char const **data, char const *top) { - unsigned long ha = 5381, c0, c1; - char const *ptr = *data; +uint64_t xdl_hash_record_verbatim(uint8_t const **data, uint8_t const *top) { + uint64_t ha = 5381, c0, c1; + uint8_t const *ptr = *data; #if 0 /* * The baseline form of the optimized loop below. This is the djb2 @@ -314,7 +314,7 @@ unsigned long xdl_hash_record_verbatim(char const **data, char const *top) { */ for (; ptr < top && *ptr != '\n'; ptr++) { ha += (ha << 5); - ha += (unsigned long) *ptr; + ha += (uint64_t) *ptr; } *data = ptr < top ? ptr + 1: ptr; #else diff --git a/xdiff/xutils.h b/xdiff/xutils.h index 13f6831047..615b4a9d35 100644 --- a/xdiff/xutils.h +++ b/xdiff/xutils.h @@ -34,9 +34,9 @@ void *xdl_cha_alloc(chastore_t *cha); long xdl_guess_lines(mmfile_t *mf, long sample); int xdl_blankline(const char *line, long size, long flags); int xdl_recmatch(const char *l1, long s1, const char *l2, long s2, long flags); -unsigned long xdl_hash_record_verbatim(char const **data, char const *top); -unsigned long xdl_hash_record_with_whitespace(char const **data, char const *top, long flags); -static inline unsigned long xdl_hash_record(char const **data, char const *top, long flags) +uint64_t xdl_hash_record_verbatim(uint8_t const **data, uint8_t const *top); +uint64_t xdl_hash_record_with_whitespace(uint8_t const **data, uint8_t const *top, uint64_t flags); +static inline uint64_t xdl_hash_record(uint8_t const **data, uint8_t const *top, uint64_t flags) { if (flags & XDF_WHITESPACE_FLAGS) return xdl_hash_record_with_whitespace(data, top, flags); From 6a26019c81faa07ba811541b4cf35be9e8ee1ead Mon Sep 17 00:00:00 2001 From: Ezekiel Newren Date: Tue, 18 Nov 2025 22:34:18 +0000 Subject: [PATCH 06/10] xdiff: split xrecord_t.ha into line_hash and minimal_perfect_hash The ha field is serving two different purposes, which makes the code harder to read. At first glance, it looks like many places assume there could never be hash collisions between lines of the two input files. In reality, line_hash is used together with xdl_recmatch() to ensure correct comparisons of lines, even when collisions occur. To make this clearer, the old ha field has been split: * line_hash: a straightforward hash of a line, independent of any external context. Its type is uint64_t, as it comes from a fixed width hash function. * minimal_perfect_hash: Not a new concept, but now a separate field. It comes from the classifier's general-purpose hash table, which assigns each line a unique and minimal hash across the two files. A size_t is used here because it's meant to be used to index an array. This also avoids ` as usize` casts on the Rust side when using it to index a slice. Signed-off-by: Ezekiel Newren Signed-off-by: Junio C Hamano --- xdiff/xdiffi.c | 6 +++--- xdiff/xhistogram.c | 4 ++-- xdiff/xpatience.c | 10 +++++----- xdiff/xprepare.c | 18 +++++++++--------- xdiff/xtypes.h | 3 ++- 5 files changed, 21 insertions(+), 20 deletions(-) diff --git a/xdiff/xdiffi.c b/xdiff/xdiffi.c index cb8e412c7b..8d96074414 100644 --- a/xdiff/xdiffi.c +++ b/xdiff/xdiffi.c @@ -22,9 +22,9 @@ #include "xinclude.h" -static unsigned long get_hash(xdfile_t *xdf, long index) +static size_t get_hash(xdfile_t *xdf, long index) { - return xdf->recs[xdf->rindex[index]].ha; + return xdf->recs[xdf->rindex[index]].minimal_perfect_hash; } #define XDL_MAX_COST_MIN 256 @@ -385,7 +385,7 @@ static xdchange_t *xdl_add_change(xdchange_t *xscr, long i1, long i2, long chg1, static int recs_match(xrecord_t *rec1, xrecord_t *rec2) { - return (rec1->ha == rec2->ha); + return rec1->minimal_perfect_hash == rec2->minimal_perfect_hash; } /* diff --git a/xdiff/xhistogram.c b/xdiff/xhistogram.c index 6dc450b1fe..5ae1282c27 100644 --- a/xdiff/xhistogram.c +++ b/xdiff/xhistogram.c @@ -90,7 +90,7 @@ struct region { static int cmp_recs(xrecord_t *r1, xrecord_t *r2) { - return r1->ha == r2->ha; + return r1->minimal_perfect_hash == r2->minimal_perfect_hash; } @@ -98,7 +98,7 @@ static int cmp_recs(xrecord_t *r1, xrecord_t *r2) (cmp_recs(REC(i->env, s1, l1), REC(i->env, s2, l2))) #define TABLE_HASH(index, side, line) \ - XDL_HASHLONG((REC(index->env, side, line))->ha, index->table_bits) + XDL_HASHLONG((REC(index->env, side, line))->minimal_perfect_hash, index->table_bits) static int scanA(struct histindex *index, int line1, int count1) { diff --git a/xdiff/xpatience.c b/xdiff/xpatience.c index bb61354f22..cc53266f3b 100644 --- a/xdiff/xpatience.c +++ b/xdiff/xpatience.c @@ -48,7 +48,7 @@ struct hashmap { int nr, alloc; struct entry { - unsigned long hash; + size_t minimal_perfect_hash; /* * 0 = unused entry, 1 = first line, 2 = second, etc. * line2 is NON_UNIQUE if the line is not unique @@ -101,10 +101,10 @@ static void insert_record(xpparam_t const *xpp, int line, struct hashmap *map, * So we multiply ha by 2 in the hope that the hashing was * "unique enough". */ - int index = (int)((record->ha << 1) % map->alloc); + int index = (int)((record->minimal_perfect_hash << 1) % map->alloc); while (map->entries[index].line1) { - if (map->entries[index].hash != record->ha) { + if (map->entries[index].minimal_perfect_hash != record->minimal_perfect_hash) { if (++index >= map->alloc) index = 0; continue; @@ -120,7 +120,7 @@ static void insert_record(xpparam_t const *xpp, int line, struct hashmap *map, if (pass == 2) return; map->entries[index].line1 = line; - map->entries[index].hash = record->ha; + map->entries[index].minimal_perfect_hash = record->minimal_perfect_hash; map->entries[index].anchor = is_anchor(xpp, (const char *)map->env->xdf1.recs[line - 1].ptr); if (!map->first) map->first = map->entries + index; @@ -248,7 +248,7 @@ static int match(struct hashmap *map, int line1, int line2) { xrecord_t *record1 = &map->env->xdf1.recs[line1 - 1]; xrecord_t *record2 = &map->env->xdf2.recs[line2 - 1]; - return record1->ha == record2->ha; + return record1->minimal_perfect_hash == record2->minimal_perfect_hash; } static int patience_diff(xpparam_t const *xpp, xdfenv_t *env, diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c index 85e56021da..bea0992b5e 100644 --- a/xdiff/xprepare.c +++ b/xdiff/xprepare.c @@ -93,12 +93,12 @@ static void xdl_free_classifier(xdlclassifier_t *cf) { static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t *rec) { - long hi; + size_t hi; xdlclass_t *rcrec; - hi = (long) XDL_HASHLONG(rec->ha, cf->hbits); + hi = XDL_HASHLONG(rec->line_hash, cf->hbits); for (rcrec = cf->rchash[hi]; rcrec; rcrec = rcrec->next) - if (rcrec->rec.ha == rec->ha && + if (rcrec->rec.line_hash == rec->line_hash && xdl_recmatch((const char *)rcrec->rec.ptr, (long)rcrec->rec.size, (const char *)rec->ptr, (long)rec->size, cf->flags)) break; @@ -120,7 +120,7 @@ static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t (pass == 1) ? rcrec->len1++ : rcrec->len2++; - rec->ha = (unsigned long) rcrec->idx; + rec->minimal_perfect_hash = (size_t)rcrec->idx; return 0; } @@ -158,7 +158,7 @@ static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_ crec = &xdf->recs[xdf->nrec++]; crec->ptr = prev; crec->size = cur - prev; - crec->ha = hav; + crec->line_hash = hav; if (xdl_classify_record(pass, cf, crec) < 0) goto abort; } @@ -290,7 +290,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd if ((mlim = xdl_bogosqrt(xdf1->nrec)) > XDL_MAX_EQLIMIT) mlim = XDL_MAX_EQLIMIT; for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) { - rcrec = cf->rcrecs[recs->ha]; + rcrec = cf->rcrecs[recs->minimal_perfect_hash]; nm = rcrec ? rcrec->len2 : 0; action1[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP; } @@ -298,7 +298,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd if ((mlim = xdl_bogosqrt(xdf2->nrec)) > XDL_MAX_EQLIMIT) mlim = XDL_MAX_EQLIMIT; for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) { - rcrec = cf->rcrecs[recs->ha]; + rcrec = cf->rcrecs[recs->minimal_perfect_hash]; nm = rcrec ? rcrec->len1 : 0; action2[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP; } @@ -350,7 +350,7 @@ static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2) { recs2 = xdf2->recs; for (i = 0, lim = XDL_MIN(xdf1->nrec, xdf2->nrec); i < lim; i++, recs1++, recs2++) - if (recs1->ha != recs2->ha) + if (recs1->minimal_perfect_hash != recs2->minimal_perfect_hash) break; xdf1->dstart = xdf2->dstart = i; @@ -358,7 +358,7 @@ static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2) { recs1 = xdf1->recs + xdf1->nrec - 1; recs2 = xdf2->recs + xdf2->nrec - 1; for (lim -= i, i = 0; i < lim; i++, recs1--, recs2--) - if (recs1->ha != recs2->ha) + if (recs1->minimal_perfect_hash != recs2->minimal_perfect_hash) break; xdf1->dend = xdf1->nrec - i - 1; diff --git a/xdiff/xtypes.h b/xdiff/xtypes.h index 354349b523..d4e9cd2e76 100644 --- a/xdiff/xtypes.h +++ b/xdiff/xtypes.h @@ -41,7 +41,8 @@ typedef struct s_chastore { typedef struct s_xrecord { uint8_t const *ptr; size_t size; - unsigned long ha; + uint64_t line_hash; + size_t minimal_perfect_hash; } xrecord_t; typedef struct s_xdfile { From 016538780e9f6e83a1d9c7b0ec771fb6c5583c0f Mon Sep 17 00:00:00 2001 From: Ezekiel Newren Date: Tue, 18 Nov 2025 22:34:19 +0000 Subject: [PATCH 07/10] xdiff: make xdfile_t.nrec a size_t instead of long size_t is used because nrec describes the number of elements for both recs, and for 'changed' + 2. Signed-off-by: Ezekiel Newren Signed-off-by: Junio C Hamano --- xdiff/xdiffi.c | 8 ++++---- xdiff/xemit.c | 20 ++++++++++---------- xdiff/xmerge.c | 8 ++++---- xdiff/xpatience.c | 2 +- xdiff/xprepare.c | 12 ++++++------ xdiff/xtypes.h | 2 +- 6 files changed, 26 insertions(+), 26 deletions(-) diff --git a/xdiff/xdiffi.c b/xdiff/xdiffi.c index 8d96074414..21d06bce96 100644 --- a/xdiff/xdiffi.c +++ b/xdiff/xdiffi.c @@ -483,7 +483,7 @@ static void measure_split(const xdfile_t *xdf, long split, { long i; - if (split >= xdf->nrec) { + if (split >= (long)xdf->nrec) { m->end_of_file = 1; m->indent = -1; } else { @@ -506,7 +506,7 @@ static void measure_split(const xdfile_t *xdf, long split, m->post_blank = 0; m->post_indent = -1; - for (i = split + 1; i < xdf->nrec; i++) { + for (i = split + 1; i < (long)xdf->nrec; i++) { m->post_indent = get_indent(&xdf->recs[i]); if (m->post_indent != -1) break; @@ -717,7 +717,7 @@ static void group_init(xdfile_t *xdf, struct xdlgroup *g) */ static inline int group_next(xdfile_t *xdf, struct xdlgroup *g) { - if (g->end == xdf->nrec) + if (g->end == (long)xdf->nrec) return -1; g->start = g->end + 1; @@ -750,7 +750,7 @@ static inline int group_previous(xdfile_t *xdf, struct xdlgroup *g) */ static int group_slide_down(xdfile_t *xdf, struct xdlgroup *g) { - if (g->end < xdf->nrec && + if (g->end < (long)xdf->nrec && recs_match(&xdf->recs[g->start], &xdf->recs[g->end])) { xdf->changed[g->start++] = false; xdf->changed[g->end++] = true; diff --git a/xdiff/xemit.c b/xdiff/xemit.c index 2f8007753c..04f7e9193b 100644 --- a/xdiff/xemit.c +++ b/xdiff/xemit.c @@ -137,7 +137,7 @@ static long get_func_line(xdfenv_t *xe, xdemitconf_t const *xecfg, buf = func_line ? func_line->buf : dummy; size = func_line ? sizeof(func_line->buf) : sizeof(dummy); - for (l = start; l != limit && 0 <= l && l < xe->xdf1.nrec; l += step) { + for (l = start; l != limit && 0 <= l && l < (long)xe->xdf1.nrec; l += step) { long len = match_func_rec(&xe->xdf1, xecfg, l, buf, size); if (len >= 0) { if (func_line) @@ -179,14 +179,14 @@ pre_context_calculation: long fs1, i1 = xch->i1; /* Appended chunk? */ - if (i1 >= xe->xdf1.nrec) { + if (i1 >= (long)xe->xdf1.nrec) { long i2 = xch->i2; /* * We don't need additional context if * a whole function was added. */ - while (i2 < xe->xdf2.nrec) { + while (i2 < (long)xe->xdf2.nrec) { if (is_func_rec(&xe->xdf2, xecfg, i2)) goto post_context_calculation; i2++; @@ -196,7 +196,7 @@ pre_context_calculation: * Otherwise get more context from the * pre-image. */ - i1 = xe->xdf1.nrec - 1; + i1 = (long)xe->xdf1.nrec - 1; } fs1 = get_func_line(xe, xecfg, NULL, i1, -1); @@ -228,8 +228,8 @@ pre_context_calculation: post_context_calculation: lctx = xecfg->ctxlen; - lctx = XDL_MIN(lctx, xe->xdf1.nrec - (xche->i1 + xche->chg1)); - lctx = XDL_MIN(lctx, xe->xdf2.nrec - (xche->i2 + xche->chg2)); + lctx = XDL_MIN(lctx, (long)xe->xdf1.nrec - (xche->i1 + xche->chg1)); + lctx = XDL_MIN(lctx, (long)xe->xdf2.nrec - (xche->i2 + xche->chg2)); e1 = xche->i1 + xche->chg1 + lctx; e2 = xche->i2 + xche->chg2 + lctx; @@ -237,13 +237,13 @@ pre_context_calculation: if (xecfg->flags & XDL_EMIT_FUNCCONTEXT) { long fe1 = get_func_line(xe, xecfg, NULL, xche->i1 + xche->chg1, - xe->xdf1.nrec); + (long)xe->xdf1.nrec); while (fe1 > 0 && is_empty_rec(&xe->xdf1, fe1 - 1)) fe1--; if (fe1 < 0) - fe1 = xe->xdf1.nrec; + fe1 = (long)xe->xdf1.nrec; if (fe1 > e1) { - e2 = XDL_MIN(e2 + (fe1 - e1), xe->xdf2.nrec); + e2 = XDL_MIN(e2 + (fe1 - e1), (long)xe->xdf2.nrec); e1 = fe1; } @@ -254,7 +254,7 @@ pre_context_calculation: */ if (xche->next) { long l = XDL_MIN(xche->next->i1, - xe->xdf1.nrec - 1); + (long)xe->xdf1.nrec - 1); if (l - xecfg->ctxlen <= e1 || get_func_line(xe, xecfg, NULL, l, e1) < 0) { xche = xche->next; diff --git a/xdiff/xmerge.c b/xdiff/xmerge.c index 0dd4558a32..29dad98c49 100644 --- a/xdiff/xmerge.c +++ b/xdiff/xmerge.c @@ -158,7 +158,7 @@ static int is_eol_crlf(xdfile_t *file, int i) { size_t size; - if (i < file->nrec - 1) + if (i < (long)file->nrec - 1) /* All lines before the last *must* end in LF */ return (size = file->recs[i].size) > 1 && file->recs[i].ptr[size - 2] == '\r'; @@ -317,7 +317,7 @@ static int xdl_fill_merge_buffer(xdfenv_t *xe1, const char *name1, continue; i = m->i1 + m->chg1; } - size += xdl_recs_copy(xe1, i, xe1->xdf2.nrec - i, 0, 0, + size += xdl_recs_copy(xe1, i, (int)xe1->xdf2.nrec - i, 0, 0, dest ? dest + size : NULL); return size; } @@ -622,7 +622,7 @@ static int xdl_do_merge(xdfenv_t *xe1, xdchange_t *xscr1, changes = c; i0 = xscr1->i1; i1 = xscr1->i2; - i2 = xscr1->i1 + xe2->xdf2.nrec - xe2->xdf1.nrec; + i2 = xscr1->i1 + (long)xe2->xdf2.nrec - (long)xe2->xdf1.nrec; chg0 = xscr1->chg1; chg1 = xscr1->chg2; chg2 = xscr1->chg1; @@ -637,7 +637,7 @@ static int xdl_do_merge(xdfenv_t *xe1, xdchange_t *xscr1, if (!changes) changes = c; i0 = xscr2->i1; - i1 = xscr2->i1 + xe1->xdf2.nrec - xe1->xdf1.nrec; + i1 = xscr2->i1 + (long)xe1->xdf2.nrec - (long)xe1->xdf1.nrec; i2 = xscr2->i2; chg0 = xscr2->chg1; chg1 = xscr2->chg1; diff --git a/xdiff/xpatience.c b/xdiff/xpatience.c index cc53266f3b..a0b31eb5d8 100644 --- a/xdiff/xpatience.c +++ b/xdiff/xpatience.c @@ -370,5 +370,5 @@ static int patience_diff(xpparam_t const *xpp, xdfenv_t *env, int xdl_do_patience_diff(xpparam_t const *xpp, xdfenv_t *env) { - return patience_diff(xpp, env, 1, env->xdf1.nrec, 1, env->xdf2.nrec); + return patience_diff(xpp, env, 1, (int)env->xdf1.nrec, 1, (int)env->xdf2.nrec); } diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c index bea0992b5e..705ddd1ae0 100644 --- a/xdiff/xprepare.c +++ b/xdiff/xprepare.c @@ -153,7 +153,7 @@ static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_ for (top = blk + bsize; cur < top; ) { prev = cur; hav = xdl_hash_record(&cur, top, xpp->flags); - if (XDL_ALLOC_GROW(xdf->recs, xdf->nrec + 1, narec)) + if (XDL_ALLOC_GROW(xdf->recs, (long)xdf->nrec + 1, narec)) goto abort; crec = &xdf->recs[xdf->nrec++]; crec->ptr = prev; @@ -287,7 +287,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd /* * Initialize temporary arrays with DISCARD, KEEP, or INVESTIGATE. */ - if ((mlim = xdl_bogosqrt(xdf1->nrec)) > XDL_MAX_EQLIMIT) + if ((mlim = xdl_bogosqrt((long)xdf1->nrec)) > XDL_MAX_EQLIMIT) mlim = XDL_MAX_EQLIMIT; for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) { rcrec = cf->rcrecs[recs->minimal_perfect_hash]; @@ -295,7 +295,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd action1[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP; } - if ((mlim = xdl_bogosqrt(xdf2->nrec)) > XDL_MAX_EQLIMIT) + if ((mlim = xdl_bogosqrt((long)xdf2->nrec)) > XDL_MAX_EQLIMIT) mlim = XDL_MAX_EQLIMIT; for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) { rcrec = cf->rcrecs[recs->minimal_perfect_hash]; @@ -348,7 +348,7 @@ static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2) { recs1 = xdf1->recs; recs2 = xdf2->recs; - for (i = 0, lim = XDL_MIN(xdf1->nrec, xdf2->nrec); i < lim; + for (i = 0, lim = (long)XDL_MIN(xdf1->nrec, xdf2->nrec); i < lim; i++, recs1++, recs2++) if (recs1->minimal_perfect_hash != recs2->minimal_perfect_hash) break; @@ -361,8 +361,8 @@ static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2) { if (recs1->minimal_perfect_hash != recs2->minimal_perfect_hash) break; - xdf1->dend = xdf1->nrec - i - 1; - xdf2->dend = xdf2->nrec - i - 1; + xdf1->dend = (long)xdf1->nrec - i - 1; + xdf2->dend = (long)xdf2->nrec - i - 1; return 0; } diff --git a/xdiff/xtypes.h b/xdiff/xtypes.h index d4e9cd2e76..4c4d9bd147 100644 --- a/xdiff/xtypes.h +++ b/xdiff/xtypes.h @@ -47,7 +47,7 @@ typedef struct s_xrecord { typedef struct s_xdfile { xrecord_t *recs; - long nrec; + size_t nrec; ptrdiff_t dstart, dend; bool *changed; long *rindex; From e35877eadbd9bee473936577c82abca9c8333abd Mon Sep 17 00:00:00 2001 From: Ezekiel Newren Date: Tue, 18 Nov 2025 22:34:20 +0000 Subject: [PATCH 08/10] xdiff: make xdfile_t.nreff a size_t instead of long size_t is used because nreff describes the number of elements in memory for rindex. Signed-off-by: Ezekiel Newren Signed-off-by: Junio C Hamano --- xdiff/xprepare.c | 14 +++++++------- xdiff/xtypes.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c index 705ddd1ae0..39fd79d9d4 100644 --- a/xdiff/xprepare.c +++ b/xdiff/xprepare.c @@ -264,7 +264,7 @@ static bool xdl_clean_mmatch(uint8_t const *action, long i, long s, long e) { * might be potentially discarded if they appear in a run of discardable. */ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) { - long i, nm, nreff, mlim; + long i, nm, mlim; xrecord_t *recs; xdlclass_t *rcrec; uint8_t *action1 = NULL, *action2 = NULL; @@ -307,29 +307,29 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd * Use temporary arrays to decide if changed[i] should remain * false, or become true. */ - for (nreff = 0, i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; + xdf1->nreff = 0; + for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) { if (action1[i] == KEEP || (action1[i] == INVESTIGATE && !xdl_clean_mmatch(action1, i, xdf1->dstart, xdf1->dend))) { - xdf1->rindex[nreff++] = i; + xdf1->rindex[xdf1->nreff++] = i; /* changed[i] remains false, i.e. keep */ } else xdf1->changed[i] = true; /* i.e. discard */ } - xdf1->nreff = nreff; - for (nreff = 0, i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; + xdf2->nreff = 0; + for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) { if (action2[i] == KEEP || (action2[i] == INVESTIGATE && !xdl_clean_mmatch(action2, i, xdf2->dstart, xdf2->dend))) { - xdf2->rindex[nreff++] = i; + xdf2->rindex[xdf2->nreff++] = i; /* changed[i] remains false, i.e. keep */ } else xdf2->changed[i] = true; /* i.e. discard */ } - xdf2->nreff = nreff; cleanup: xdl_free(action1); diff --git a/xdiff/xtypes.h b/xdiff/xtypes.h index 4c4d9bd147..1f495f987f 100644 --- a/xdiff/xtypes.h +++ b/xdiff/xtypes.h @@ -51,7 +51,7 @@ typedef struct s_xdfile { ptrdiff_t dstart, dend; bool *changed; long *rindex; - long nreff; + size_t nreff; } xdfile_t; typedef struct s_xdfenv { From 5004a8da14e2aa80b5697b0a3a60e594af1c8292 Mon Sep 17 00:00:00 2001 From: Ezekiel Newren Date: Tue, 18 Nov 2025 22:34:21 +0000 Subject: [PATCH 09/10] xdiff: change rindex from long to size_t in xdfile_t The field rindex describes an index offset for other arrays. Change it to size_t. Changing the type of rindex from long to size_t has no cascading refactor impact because it is only ever used to directly index other arrays. Signed-off-by: Ezekiel Newren Signed-off-by: Junio C Hamano --- xdiff/xtypes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xdiff/xtypes.h b/xdiff/xtypes.h index 1f495f987f..9074cdadd1 100644 --- a/xdiff/xtypes.h +++ b/xdiff/xtypes.h @@ -50,7 +50,7 @@ typedef struct s_xdfile { size_t nrec; ptrdiff_t dstart, dend; bool *changed; - long *rindex; + size_t *rindex; size_t nreff; } xdfile_t; From 22ce0cb6397d3d15c21c217696f262c4b8eb44b3 Mon Sep 17 00:00:00 2001 From: Ezekiel Newren Date: Tue, 18 Nov 2025 22:34:22 +0000 Subject: [PATCH 10/10] xdiff: rename rindex -> reference_index The classic diff adds only the lines that it's going to consider, during the diff, to an array. A mapping between the compacted array, and the lines of the file that they reference, is facilitated by this array. Signed-off-by: Ezekiel Newren Signed-off-by: Junio C Hamano --- xdiff/xdiffi.c | 6 +++--- xdiff/xprepare.c | 10 +++++----- xdiff/xtypes.h | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/xdiff/xdiffi.c b/xdiff/xdiffi.c index 21d06bce96..4376f943db 100644 --- a/xdiff/xdiffi.c +++ b/xdiff/xdiffi.c @@ -24,7 +24,7 @@ static size_t get_hash(xdfile_t *xdf, long index) { - return xdf->recs[xdf->rindex[index]].minimal_perfect_hash; + return xdf->recs[xdf->reference_index[index]].minimal_perfect_hash; } #define XDL_MAX_COST_MIN 256 @@ -278,10 +278,10 @@ int xdl_recs_cmp(xdfile_t *xdf1, long off1, long lim1, */ if (off1 == lim1) { for (; off2 < lim2; off2++) - xdf2->changed[xdf2->rindex[off2]] = true; + xdf2->changed[xdf2->reference_index[off2]] = true; } else if (off2 == lim2) { for (; off1 < lim1; off1++) - xdf1->changed[xdf1->rindex[off1]] = true; + xdf1->changed[xdf1->reference_index[off1]] = true; } else { xdpsplit_t spl; spl.i1 = spl.i2 = 0; diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c index 39fd79d9d4..34c82e4f8e 100644 --- a/xdiff/xprepare.c +++ b/xdiff/xprepare.c @@ -128,7 +128,7 @@ static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t static void xdl_free_ctx(xdfile_t *xdf) { - xdl_free(xdf->rindex); + xdl_free(xdf->reference_index); xdl_free(xdf->changed - 1); xdl_free(xdf->recs); } @@ -141,7 +141,7 @@ static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_ uint8_t const *blk, *cur, *top, *prev; xrecord_t *crec; - xdf->rindex = NULL; + xdf->reference_index = NULL; xdf->changed = NULL; xdf->recs = NULL; @@ -169,7 +169,7 @@ static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_ if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) && (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF)) { - if (!XDL_ALLOC_ARRAY(xdf->rindex, xdf->nrec + 1)) + if (!XDL_ALLOC_ARRAY(xdf->reference_index, xdf->nrec + 1)) goto abort; } @@ -312,7 +312,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd i <= xdf1->dend; i++, recs++) { if (action1[i] == KEEP || (action1[i] == INVESTIGATE && !xdl_clean_mmatch(action1, i, xdf1->dstart, xdf1->dend))) { - xdf1->rindex[xdf1->nreff++] = i; + xdf1->reference_index[xdf1->nreff++] = i; /* changed[i] remains false, i.e. keep */ } else xdf1->changed[i] = true; @@ -324,7 +324,7 @@ static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xd i <= xdf2->dend; i++, recs++) { if (action2[i] == KEEP || (action2[i] == INVESTIGATE && !xdl_clean_mmatch(action2, i, xdf2->dstart, xdf2->dend))) { - xdf2->rindex[xdf2->nreff++] = i; + xdf2->reference_index[xdf2->nreff++] = i; /* changed[i] remains false, i.e. keep */ } else xdf2->changed[i] = true; diff --git a/xdiff/xtypes.h b/xdiff/xtypes.h index 9074cdadd1..979586f20a 100644 --- a/xdiff/xtypes.h +++ b/xdiff/xtypes.h @@ -50,7 +50,7 @@ typedef struct s_xdfile { size_t nrec; ptrdiff_t dstart, dend; bool *changed; - size_t *rindex; + size_t *reference_index; size_t nreff; } xdfile_t;