Skip to content

Commit

Permalink
diff: add autoencode feature
Browse files Browse the repository at this point in the history
This feature automatically converts UTF-16 and UTF-32 content into
UTF-8 for diffing, as long as all of the following hold:

  1. textconv is enabled

  2. the autoencode feature is enabled (which defaults to
     off for now)

  3. the file does not have a configured textconv filter

TODO:

  - documentation
  - tests
  - address the FIXME (what to do when reencoding fails)

Signed-off-by: Jeff King <[email protected]>
  • Loading branch information
peff committed Dec 10, 2024
1 parent fac42c6 commit 811932d
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 1 deletion.
43 changes: 42 additions & 1 deletion diff.c
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ static struct diff_options default_diff_options;
static long diff_algorithm;
static unsigned ws_error_highlight_default = WSEH_NEW;

static struct userdiff_textconv autoencode_textconv = { "autoencode" };

static char diff_colors[][COLOR_MAXLEN] = {
GIT_COLOR_RESET,
GIT_COLOR_NORMAL, /* CONTEXT */
Expand Down Expand Up @@ -3486,14 +3488,22 @@ struct userdiff_textconv *diff_get_textconv(struct repository *r,
struct diff_options *opt,
struct diff_filespec *one)
{
/*
 * Select the textconv to use for "one".
 *
 * Returns NULL when textconv is not allowed by "opt" or when "one" is
 * not a valid filespec. Otherwise the textconv reported by
 * userdiff_get_textconv() for the file's driver is used; only if that
 * is NULL, autoencode is allowed in "opt", and the content type is
 * DIFF_CONTENT_UTF does this fall back to the static
 * autoencode_textconv.
 */
struct userdiff_textconv *textconv;

/* textconv disabled for this diff: nothing to select */
if (!opt->flags.allow_textconv)
return NULL;

/* no usable file on this side of the diff */
if (!DIFF_FILE_VALID(one))
return NULL;

/* populate one->driver before querying it below */
diff_filespec_load_driver(one, r->index);
/*
 * NOTE(review): in this diff rendering the next line looks like the
 * pre-change line that the commit replaces with the assignment that
 * follows it; read literally it would return before the autoencode
 * check is reached -- confirm against the actual file.
 */
return userdiff_get_textconv(r, one->driver);
textconv = userdiff_get_textconv(r, one->driver);

/* a configured textconv always takes precedence over autoencode */
if (!textconv && opt->flags.allow_autoencode &&
diff_filespec_content_type(r, one) == DIFF_CONTENT_UTF)
textconv = &autoencode_textconv;

return textconv;
}

static struct string_list *additional_headers(struct diff_options *o,
Expand Down Expand Up @@ -5881,6 +5891,8 @@ struct option *add_diff_options(const struct option *opts,
OPT_CALLBACK_F(0, "textconv", options, NULL,
N_("run external text conversion filters when comparing binary files"),
PARSE_OPT_NOARG, diff_opt_textconv),
OPT_BOOL(0, "autoencode", &options->flags.allow_autoencode,
N_("allow automatic encoding conversion")),
OPT_CALLBACK_F(0, "ignore-submodules", options, N_("<when>"),
N_("ignore changes to submodules in the diff generation"),
PARSE_OPT_NONEG | PARSE_OPT_OPTARG,
Expand Down Expand Up @@ -7346,6 +7358,35 @@ size_t fill_textconv(struct repository *r,
return 0;
}

if (textconv == &autoencode_textconv) {
size_t outsize;
const char *from_encoding;

if (diff_populate_filespec(r, df, NULL))
die("unable to read files to diff");

from_encoding = buffer_has_utf_bom(df->data, df->size);
if (!from_encoding)
BUG("autoencode triggered for non-utf content");

*outbuf = reencode_string_len(df->data, df->size,
"UTF-8", from_encoding,
&outsize);

/*
* FIXME Our encoding guess failed. It's too late to return
* the original content, since the caller has already decided
* not to treat the contents as binary. But we could perhaps
* give some munged text form (e.g., by escaping high-bit
* characters and NULs).
*/
if (!*outbuf)
die_errno("unable to reencode from %s for path '%s'",
from_encoding, df->path);

return outsize;
}

if (!textconv) {
if (diff_populate_filespec(r, df, NULL))
die("unable to read files to diff");
Expand Down
1 change: 1 addition & 0 deletions diff.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ struct diff_flags {
unsigned dirstat_by_file;
unsigned allow_textconv;
unsigned textconv_set_via_cmdline;
unsigned allow_autoencode;
unsigned diff_from_contents;
unsigned dirty_submodules;
unsigned ignore_untracked_in_submodules;
Expand Down

0 comments on commit 811932d

Please sign in to comment.