* [PATCH v12 1/3] grep: refactor next_match() and match_one_pattern() for external use
@ 2021-10-08 22:49 Hamza Mahfooz
2021-10-08 22:49 ` [PATCH v12 2/3] pretty: colorize pattern matches in commit messages Hamza Mahfooz
2021-10-08 22:49 ` [PATCH v12 3/3] grep: fix an edge case concerning ascii patterns and UTF-8 data Hamza Mahfooz
0 siblings, 2 replies; 5+ messages in thread
From: Hamza Mahfooz @ 2021-10-08 22:49 UTC (permalink / raw)
To: git; +Cc: Junio C Hamano, Hamza Mahfooz
These changes are made in preparation of, the colorization support for the
"git log" subcommands that, rely on regex functionality (i.e. "--author",
"--committer" and "--grep"). These changes are necessary primarily because
match_one_pattern() expects header lines to be prefixed, however, in
pretty, the prefixes are stripped from the lines because the name-email
pairs need to go through additional parsing, before they can be printed and
because next_match() doesn't handle the case of
"ctx == GREP_CONTEXT_HEAD" at all. So, teach next_match() how to handle the
new case and move match_one_pattern()'s core logic to
headerless_match_one_pattern() while preserving match_one_pattern()'s uses
that depend on the additional processing.
Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
---
v5: separate grep changes from pretty changes.
v6: rescope some variables.
v7: export header_field[] and allow for subsequent matches on header lines
in match_one_pattern().
v8: add headerless_match_one_pattern() and move header_field[] back.
v9: get rid of the new check in headerless_match_one_pattern(), move the
header pattern filtering logic in grep_next_match() and document
grep_next_match() in grep.h.
v10: add a "magic" comment in grep_next_match() to signify a fall through
in the switch statement.
---
grep.c | 79 ++++++++++++++++++++++++++++++++++++----------------------
grep.h | 9 +++++++
2 files changed, 58 insertions(+), 30 deletions(-)
diff --git a/grep.c b/grep.c
index 14fe8a0fd2..fe847a0111 100644
--- a/grep.c
+++ b/grep.c
@@ -944,10 +944,10 @@ static struct {
{ "reflog ", 7 },
};
-static int match_one_pattern(struct grep_pat *p,
- const char *bol, const char *eol,
- enum grep_context ctx,
- regmatch_t *pmatch, int eflags)
+static int headerless_match_one_pattern(struct grep_pat *p,
+ const char *bol, const char *eol,
+ enum grep_context ctx,
+ regmatch_t *pmatch, int eflags)
{
int hit = 0;
const char *start = bol;
@@ -956,25 +956,6 @@ static int match_one_pattern(struct grep_pat *p,
((p->token == GREP_PATTERN_HEAD) != (ctx == GREP_CONTEXT_HEAD)))
return 0;
- if (p->token == GREP_PATTERN_HEAD) {
- const char *field;
- size_t len;
- assert(p->field < ARRAY_SIZE(header_field));
- field = header_field[p->field].field;
- len = header_field[p->field].len;
- if (strncmp(bol, field, len))
- return 0;
- bol += len;
- switch (p->field) {
- case GREP_HEADER_AUTHOR:
- case GREP_HEADER_COMMITTER:
- strip_timestamp(bol, &eol);
- break;
- default:
- break;
- }
- }
-
again:
hit = patmatch(p, bol, eol, pmatch, eflags);
@@ -1025,6 +1006,36 @@ static int match_one_pattern(struct grep_pat *p,
return hit;
}
+static int match_one_pattern(struct grep_pat *p,
+ const char *bol, const char *eol,
+ enum grep_context ctx, regmatch_t *pmatch,
+ int eflags)
+{
+ const char *field;
+ size_t len;
+
+ if (p->token == GREP_PATTERN_HEAD) {
+ assert(p->field < ARRAY_SIZE(header_field));
+ field = header_field[p->field].field;
+ len = header_field[p->field].len;
+ if (strncmp(bol, field, len))
+ return 0;
+ bol += len;
+
+ switch (p->field) {
+ case GREP_HEADER_AUTHOR:
+ case GREP_HEADER_COMMITTER:
+ strip_timestamp(bol, &eol);
+ break;
+ default:
+ break;
+ }
+ }
+
+ return headerless_match_one_pattern(p, bol, eol, ctx, pmatch, eflags);
+}
+
+
static int match_expr_eval(struct grep_opt *opt, struct grep_expr *x,
const char *bol, const char *eol,
enum grep_context ctx, ssize_t *col,
@@ -1143,7 +1154,7 @@ static int match_next_pattern(struct grep_pat *p,
{
regmatch_t match;
- if (!match_one_pattern(p, bol, eol, ctx, &match, eflags))
+ if (!headerless_match_one_pattern(p, bol, eol, ctx, &match, eflags))
return 0;
if (match.rm_so < 0 || match.rm_eo < 0)
return 0;
@@ -1158,19 +1169,26 @@ static int match_next_pattern(struct grep_pat *p,
return 1;
}
-static int next_match(struct grep_opt *opt,
- const char *bol, const char *eol,
- enum grep_context ctx, regmatch_t *pmatch, int eflags)
+int grep_next_match(struct grep_opt *opt,
+ const char *bol, const char *eol,
+ enum grep_context ctx, regmatch_t *pmatch,
+ enum grep_header_field field, int eflags)
{
struct grep_pat *p;
int hit = 0;
pmatch->rm_so = pmatch->rm_eo = -1;
if (bol < eol) {
- for (p = opt->pattern_list; p; p = p->next) {
+ for (p = ((ctx == GREP_CONTEXT_HEAD)
+ ? opt->header_list : opt->pattern_list);
+ p; p = p->next) {
switch (p->token) {
- case GREP_PATTERN: /* atom */
case GREP_PATTERN_HEAD:
+ if ((field != GREP_HEADER_FIELD_MAX) &&
+ (p->field != field))
+ continue;
+ /* fall thru */
+ case GREP_PATTERN: /* atom */
case GREP_PATTERN_BODY:
hit |= match_next_pattern(p, bol, eol, ctx,
pmatch, eflags);
@@ -1261,7 +1279,8 @@ static void show_line(struct grep_opt *opt,
else if (sign == '=')
line_color = opt->colors[GREP_COLOR_FUNCTION];
}
- while (next_match(opt, bol, eol, ctx, &match, eflags)) {
+ while (grep_next_match(opt, bol, eol, ctx, &match,
+ GREP_HEADER_FIELD_MAX, eflags)) {
if (match.rm_so == match.rm_eo)
break;
diff --git a/grep.h b/grep.h
index 3cb8a83ae8..808ad76f0c 100644
--- a/grep.h
+++ b/grep.h
@@ -191,6 +191,15 @@ void compile_grep_patterns(struct grep_opt *opt);
void free_grep_patterns(struct grep_opt *opt);
int grep_buffer(struct grep_opt *opt, const char *buf, unsigned long size);
+/* The field parameter is only used to filter header patterns
+ * (where appropriate). If filtering isn't desirable
+ * GREP_HEADER_FIELD_MAX should be supplied.
+ */
+int grep_next_match(struct grep_opt *opt,
+ const char *bol, const char *eol,
+ enum grep_context ctx, regmatch_t *pmatch,
+ enum grep_header_field field, int eflags);
+
struct grep_source {
char *name;
--
2.33.0
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH v12 2/3] pretty: colorize pattern matches in commit messages
2021-10-08 22:49 [PATCH v12 1/3] grep: refactor next_match() and match_one_pattern() for external use Hamza Mahfooz
@ 2021-10-08 22:49 ` Hamza Mahfooz
2021-10-08 22:49 ` [PATCH v12 3/3] grep: fix an edge case concerning ascii patterns and UTF-8 data Hamza Mahfooz
1 sibling, 0 replies; 5+ messages in thread
From: Hamza Mahfooz @ 2021-10-08 22:49 UTC (permalink / raw)
To: git; +Cc: Junio C Hamano, Hamza Mahfooz
The "git log" command limits its output to the commits that contain strings
matched by a pattern when the "--grep=<pattern>" option is used, but unlike
output from "git grep -e <pattern>", the matches are not highlighted,
making them harder to spot.
Teach the pretty-printer code to highlight matches from the
"--grep=<pattern>", "--author=<pattern>" and "--committer=<pattern>"
options (to view the last one, you may have to ask for --pretty=fuller).
Also, it must be noted that we are effectively greping the content twice
(because it would be a hassle to rework the existing matching code to do
a /g match and then pass it all down to the coloring code), however it only
slows down "git log --author=^H" on this repository by around 1-2%
(compared to v2.33.0), so it should be a small enough slow down to justify
the addition of the feature.
Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
---
v2: make the commit message whole (add the missing ingredients), rename
append_matched_line() to append_line_with_color(), use
colors[GREP_COLOR_MATCH_SELECTED] instead of
colors[GREP_COLOR_MATCH_CONTEXT], allow the background color to be
customized, don't copy strings to a buffer when not coloring in
append_line_with_color(), rename next_match() to grep_next_match(),
repurpose grep_next_match()/match_one_pattern() for use in
append_line_with_color() (allowing us to remove duplicated matching
code in append_line_with_color()), document how to customize the
feature and modify some of the tests to fit the feature better.
v3: fix a formatting issue with the added documentation.
v4: add strbuf_add_with_color(), use the correct color code scheme in the
unit tests and add more unit tests.
v5: separate grep changes from pretty changes and add some performance
analysis in the commit message.
v6: put the documentation in the correct place, cleanup pretty.c and
format the unit tests according to the current convention.
v7: get rid of all manual strbuf management, constify where now appropriate
and fix the header line prefix issue properly.
v8: remove code that relies on grep_header_fields[].
v11: clarify why we are grepping twice.
---
Documentation/config/color.txt | 7 ++-
pretty.c | 101 +++++++++++++++++++++++++++++----
t/t4202-log.sh | 51 +++++++++++++++++
3 files changed, 145 insertions(+), 14 deletions(-)
diff --git a/Documentation/config/color.txt b/Documentation/config/color.txt
index e05d520a86..91d9a9da32 100644
--- a/Documentation/config/color.txt
+++ b/Documentation/config/color.txt
@@ -104,9 +104,12 @@ color.grep.<slot>::
`matchContext`;;
matching text in context lines
`matchSelected`;;
- matching text in selected lines
+ matching text in selected lines. Also, used to customize the following
+ linkgit:git-log[1] subcommands: `--grep`, `--author` and `--committer`.
`selected`;;
- non-matching text in selected lines
+ non-matching text in selected lines. Also, used to customize the
+ following linkgit:git-log[1] subcommands: `--grep`, `--author` and
+ `--committer`.
`separator`;;
separators between fields on a line (`:`, `-`, and `=`)
and between hunks (`--`)
diff --git a/pretty.c b/pretty.c
index 73b5ead509..2dd94af886 100644
--- a/pretty.c
+++ b/pretty.c
@@ -431,6 +431,52 @@ const char *show_ident_date(const struct ident_split *ident,
return show_date(date, tz, mode);
}
+static inline void strbuf_add_with_color(struct strbuf *sb, const char *color,
+ const char *buf, size_t buflen)
+{
+ strbuf_addstr(sb, color);
+ strbuf_add(sb, buf, buflen);
+ if (*color)
+ strbuf_addstr(sb, GIT_COLOR_RESET);
+}
+
+static void append_line_with_color(struct strbuf *sb, struct grep_opt *opt,
+ const char *line, size_t linelen,
+ int color, enum grep_context ctx,
+ enum grep_header_field field)
+{
+ const char *buf, *eol, *line_color, *match_color;
+ regmatch_t match;
+ int eflags = 0;
+
+ buf = line;
+ eol = buf + linelen;
+
+ if (!opt || !want_color(color) || opt->invert)
+ goto end;
+
+ line_color = opt->colors[GREP_COLOR_SELECTED];
+ match_color = opt->colors[GREP_COLOR_MATCH_SELECTED];
+
+ while (grep_next_match(opt, buf, eol, ctx, &match, field, eflags)) {
+ if (match.rm_so == match.rm_eo)
+ break;
+
+ strbuf_add_with_color(sb, line_color, buf, match.rm_so);
+ strbuf_add_with_color(sb, match_color, buf + match.rm_so,
+ match.rm_eo - match.rm_so);
+ buf += match.rm_eo;
+ eflags = REG_NOTBOL;
+ }
+
+ if (eflags)
+ strbuf_add_with_color(sb, line_color, buf, eol - buf);
+ else {
+end:
+ strbuf_add(sb, buf, eol - buf);
+ }
+}
+
void pp_user_info(struct pretty_print_context *pp,
const char *what, struct strbuf *sb,
const char *line, const char *encoding)
@@ -496,9 +542,26 @@ void pp_user_info(struct pretty_print_context *pp,
strbuf_addch(sb, '\n');
strbuf_addf(sb, " <%.*s>\n", (int)maillen, mailbuf);
} else {
- strbuf_addf(sb, "%s: %.*s%.*s <%.*s>\n", what,
- (pp->fmt == CMIT_FMT_FULLER) ? 4 : 0, " ",
- (int)namelen, namebuf, (int)maillen, mailbuf);
+ struct strbuf id = STRBUF_INIT;
+ enum grep_header_field field = GREP_HEADER_FIELD_MAX;
+ struct grep_opt *opt = pp->rev ? &pp->rev->grep_filter : NULL;
+
+ if (!strcmp(what, "Author"))
+ field = GREP_HEADER_AUTHOR;
+ else if (!strcmp(what, "Commit"))
+ field = GREP_HEADER_COMMITTER;
+
+ strbuf_addf(sb, "%s: ", what);
+ if (pp->fmt == CMIT_FMT_FULLER)
+ strbuf_addchars(sb, ' ', 4);
+
+ strbuf_addf(&id, "%.*s <%.*s>", (int)namelen, namebuf,
+ (int)maillen, mailbuf);
+
+ append_line_with_color(sb, opt, id.buf, id.len, pp->color,
+ GREP_CONTEXT_HEAD, field);
+ strbuf_addch(sb, '\n');
+ strbuf_release(&id);
}
switch (pp->fmt) {
@@ -1939,8 +2002,9 @@ static int pp_utf8_width(const char *start, const char *end)
return width;
}
-static void strbuf_add_tabexpand(struct strbuf *sb, int tabwidth,
- const char *line, int linelen)
+static void strbuf_add_tabexpand(struct strbuf *sb, struct grep_opt *opt,
+ int color, int tabwidth, const char *line,
+ int linelen)
{
const char *tab;
@@ -1957,7 +2021,9 @@ static void strbuf_add_tabexpand(struct strbuf *sb, int tabwidth,
break;
/* Output the data .. */
- strbuf_add(sb, line, tab - line);
+ append_line_with_color(sb, opt, line, tab - line, color,
+ GREP_CONTEXT_BODY,
+ GREP_HEADER_FIELD_MAX);
/* .. and the de-tabified tab */
strbuf_addchars(sb, ' ', tabwidth - (width % tabwidth));
@@ -1972,7 +2038,8 @@ static void strbuf_add_tabexpand(struct strbuf *sb, int tabwidth,
* worrying about width - there's nothing more to
* align.
*/
- strbuf_add(sb, line, linelen);
+ append_line_with_color(sb, opt, line, linelen, color, GREP_CONTEXT_BODY,
+ GREP_HEADER_FIELD_MAX);
}
/*
@@ -1984,11 +2051,16 @@ static void pp_handle_indent(struct pretty_print_context *pp,
struct strbuf *sb, int indent,
const char *line, int linelen)
{
+ struct grep_opt *opt = pp->rev ? &pp->rev->grep_filter : NULL;
+
strbuf_addchars(sb, ' ', indent);
if (pp->expand_tabs_in_log)
- strbuf_add_tabexpand(sb, pp->expand_tabs_in_log, line, linelen);
+ strbuf_add_tabexpand(sb, opt, pp->color, pp->expand_tabs_in_log,
+ line, linelen);
else
- strbuf_add(sb, line, linelen);
+ append_line_with_color(sb, opt, line, linelen, pp->color,
+ GREP_CONTEXT_BODY,
+ GREP_HEADER_FIELD_MAX);
}
static int is_mboxrd_from(const char *line, int len)
@@ -2006,7 +2078,9 @@ void pp_remainder(struct pretty_print_context *pp,
struct strbuf *sb,
int indent)
{
+ struct grep_opt *opt = pp->rev ? &pp->rev->grep_filter : NULL;
int first = 1;
+
for (;;) {
const char *line = *msg_p;
int linelen = get_one_line(line);
@@ -2027,14 +2101,17 @@ void pp_remainder(struct pretty_print_context *pp,
if (indent)
pp_handle_indent(pp, sb, indent, line, linelen);
else if (pp->expand_tabs_in_log)
- strbuf_add_tabexpand(sb, pp->expand_tabs_in_log,
- line, linelen);
+ strbuf_add_tabexpand(sb, opt, pp->color,
+ pp->expand_tabs_in_log, line,
+ linelen);
else {
if (pp->fmt == CMIT_FMT_MBOXRD &&
is_mboxrd_from(line, linelen))
strbuf_addch(sb, '>');
- strbuf_add(sb, line, linelen);
+ append_line_with_color(sb, opt, line, linelen,
+ pp->color, GREP_CONTEXT_BODY,
+ GREP_HEADER_FIELD_MAX);
}
strbuf_addch(sb, '\n');
}
diff --git a/t/t4202-log.sh b/t/t4202-log.sh
index 9dfead936b..3d240bba57 100755
--- a/t/t4202-log.sh
+++ b/t/t4202-log.sh
@@ -449,6 +449,57 @@ test_expect_success !FAIL_PREREQS 'log with various grep.patternType configurati
)
'
+test_expect_success 'log --author' '
+ cat >expect <<-\EOF &&
+ Author: <BOLD;RED>A U<RESET> Thor <author@example.com>
+ EOF
+ git log -1 --color=always --author="A U" >log &&
+ grep Author log >actual.raw &&
+ test_decode_color <actual.raw >actual &&
+ test_cmp expect actual
+'
+
+test_expect_success 'log --committer' '
+ cat >expect <<-\EOF &&
+ Commit: C O Mitter <committer@<BOLD;RED>example<RESET>.com>
+ EOF
+ git log -1 --color=always --pretty=fuller --committer="example" >log &&
+ grep "Commit:" log >actual.raw &&
+ test_decode_color <actual.raw >actual &&
+ test_cmp expect actual
+'
+
+test_expect_success 'log -i --grep with color' '
+ cat >expect <<-\EOF &&
+ <BOLD;RED>Sec<RESET>ond
+ <BOLD;RED>sec<RESET>ond
+ EOF
+ git log --color=always -i --grep=^sec >log &&
+ grep -i sec log >actual.raw &&
+ test_decode_color <actual.raw >actual &&
+ test_cmp expect actual
+'
+
+test_expect_success '-c color.grep.selected log --grep' '
+ cat >expect <<-\EOF &&
+ <GREEN>th<RESET><BOLD;RED>ir<RESET><GREEN>d<RESET>
+ EOF
+ git -c color.grep.selected="green" log --color=always --grep=ir >log &&
+ grep ir log >actual.raw &&
+ test_decode_color <actual.raw >actual &&
+ test_cmp expect actual
+'
+
+test_expect_success '-c color.grep.matchSelected log --grep' '
+ cat >expect <<-\EOF &&
+ <BLUE>i<RESET>n<BLUE>i<RESET>t<BLUE>i<RESET>al
+ EOF
+ git -c color.grep.matchSelected="blue" log --color=always --grep=i >log &&
+ grep al log >actual.raw &&
+ test_decode_color <actual.raw >actual &&
+ test_cmp expect actual
+'
+
cat > expect <<EOF
* Second
* sixth
--
2.33.0
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH v12 3/3] grep: fix an edge case concerning ascii patterns and UTF-8 data
2021-10-08 22:49 [PATCH v12 1/3] grep: refactor next_match() and match_one_pattern() for external use Hamza Mahfooz
2021-10-08 22:49 ` [PATCH v12 2/3] pretty: colorize pattern matches in commit messages Hamza Mahfooz
@ 2021-10-08 22:49 ` Hamza Mahfooz
2021-10-09 1:37 ` Ævar Arnfjörð Bjarmason
1 sibling, 1 reply; 5+ messages in thread
From: Hamza Mahfooz @ 2021-10-08 22:49 UTC (permalink / raw)
To: git; +Cc: Junio C Hamano, Hamza Mahfooz, Ævar Arnfjörð Bjarmason
If we attempt to grep non-ascii log message text with an ascii pattern, we
run into the following issue:
$ git log --color --author='.var.*Bjar' -1 origin/master | grep ^Author
grep: (standard input): binary file matches
So, to fix this teach the grep code to mark the pattern as UTF-8 (even if
the pattern is composed of only ascii characters), so long as the log
output is encoded using UTF-8.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
---
v12: get rid of utf8_all_the_things and fix an issue with one of the unit
tests.
---
grep.c | 6 +++--
t/t7812-grep-icase-non-ascii.sh | 48 +++++++++++++++++++++++++++++++++
2 files changed, 52 insertions(+), 2 deletions(-)
diff --git a/grep.c b/grep.c
index fe847a0111..f6e113e9f0 100644
--- a/grep.c
+++ b/grep.c
@@ -382,8 +382,10 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
}
options |= PCRE2_CASELESS;
}
- if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
- !(!opt->ignore_case && (p->fixed || p->is_fixed)))
+ if ((!opt->ignore_locale && !has_non_ascii(p->pattern)) ||
+ (!opt->ignore_locale && is_utf8_locale() &&
+ has_non_ascii(p->pattern) && !(!opt->ignore_case &&
+ (p->fixed || p->is_fixed))))
options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
#ifdef GIT_PCRE2_VERSION_10_36_OR_HIGHER
diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh
index e5d1e4ea68..22487d90fd 100755
--- a/t/t7812-grep-icase-non-ascii.sh
+++ b/t/t7812-grep-icase-non-ascii.sh
@@ -53,6 +53,54 @@ test_expect_success REGEX_LOCALE 'pickaxe -i on non-ascii' '
test_cmp expected actual
'
+test_expect_success GETTEXT_LOCALE,PCRE 'log --author with an ascii pattern on UTF-8 data' '
+ cat >expected <<-\EOF &&
+ Author: <BOLD;RED>À Ú Thor<RESET> <author@example.com>
+ EOF
+ test_write_lines "forth" >file4 &&
+ git add file4 &&
+ git commit --author="À Ú Thor <author@example.com>" -m sécond &&
+ git log -1 --color=always --perl-regexp --author=".*Thor" >log &&
+ grep Author log >actual.raw &&
+ test_decode_color <actual.raw >actual &&
+ test_cmp expected actual
+'
+
+test_expect_success GETTEXT_LOCALE,PCRE 'log --committer with an ascii pattern on ISO-8859-1 data' '
+ cat >expected <<-\EOF &&
+ Commit: Ç<BOLD;RED> O Mîtter <committer@example.com><RESET>
+ EOF
+ test_write_lines "fifth" >file5 &&
+ git add file5 &&
+ GIT_COMMITTER_NAME="Ç O Mîtter" &&
+ GIT_COMMITTER_EMAIL="committer@example.com" &&
+ git -c i18n.commitEncoding=latin1 commit -m thïrd &&
+ git -c i18n.logOutputEncoding=latin1 log -1 --pretty=fuller --color=always --perl-regexp --committer=" O.*" >log &&
+ grep Commit: log >actual.raw &&
+ test_decode_color <actual.raw >actual &&
+ test_cmp expected actual
+'
+
+test_expect_success GETTEXT_LOCALE,PCRE 'log --grep with an ascii pattern on UTF-8 data' '
+ cat >expected <<-\EOF &&
+ sé<BOLD;RED>con<RESET>d
+ EOF
+ git log -1 --color=always --perl-regexp --grep="con" >log &&
+ grep con log >actual.raw &&
+ test_decode_color <actual.raw >actual &&
+ test_cmp expected actual
+'
+
+test_expect_success GETTEXT_LOCALE,PCRE 'log --grep with an ascii pattern on ISO-8859-1 data' '
+ cat >expected <<-\EOF &&
+ <BOLD;RED>thïrd<RESET>
+ EOF
+ git -c i18n.logOutputEncoding=latin1 log -1 --color=always --perl-regexp --grep="th.*rd" >log &&
+ grep "th.*rd" log >actual.raw &&
+ test_decode_color <actual.raw >actual &&
+ test_cmp expected actual
+'
+
test_expect_success GETTEXT_LOCALE,LIBPCRE2 'PCRE v2: setup invalid UTF-8 data' '
printf "\\200\\n" >invalid-0x80 &&
echo "ævar" >expected &&
--
2.33.0
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH v12 3/3] grep: fix an edge case concerning ascii patterns and UTF-8 data
2021-10-08 22:49 ` [PATCH v12 3/3] grep: fix an edge case concerning ascii patterns and UTF-8 data Hamza Mahfooz
@ 2021-10-09 1:37 ` Ævar Arnfjörð Bjarmason
2021-10-12 23:01 ` Hamza Mahfooz
0 siblings, 1 reply; 5+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2021-10-09 1:37 UTC (permalink / raw)
To: Hamza Mahfooz; +Cc: git, Junio C Hamano
On Fri, Oct 08 2021, Hamza Mahfooz wrote:
> If we attempt to grep non-ascii log message text with an ascii pattern, we
> run into the following issue:
>
> $ git log --color --author='.var.*Bjar' -1 origin/master | grep ^Author
> grep: (standard input): binary file matches
>
> So, to fix this teach the grep code to mark the pattern as UTF-8 (even if
> the pattern is composed of only ascii characters), so long as the log
> output is encoded using UTF-8.
>
> Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> Signed-off-by: Hamza Mahfooz <someguy@effective-light.com>
> ---
> v12: get rid of utf8_all_the_things and fix an issue with one of the unit
> tests.
> ---
> grep.c | 6 +++--
> t/t7812-grep-icase-non-ascii.sh | 48 +++++++++++++++++++++++++++++++++
> 2 files changed, 52 insertions(+), 2 deletions(-)
>
> diff --git a/grep.c b/grep.c
> index fe847a0111..f6e113e9f0 100644
> --- a/grep.c
> +++ b/grep.c
> @@ -382,8 +382,10 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
> }
> options |= PCRE2_CASELESS;
> }
> - if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
> - !(!opt->ignore_case && (p->fixed || p->is_fixed)))
> + if ((!opt->ignore_locale && !has_non_ascii(p->pattern)) ||
> + (!opt->ignore_locale && is_utf8_locale() &&
> + has_non_ascii(p->pattern) && !(!opt->ignore_case &&
> + (p->fixed || p->is_fixed))))
> options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
I think at least some of that existing "if" is my fault, and I *think*
your patch works here, but FWIW I'd find something like this way more
readable:
@@ -382,8 +383,16 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
}
options |= PCRE2_CASELESS;
}
+ if (opt->utf8_all_the_things) {
+ options |= PCRE2_UCP;
+ do_utf8 = 1;
+ }
+
if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern) &&
!(!opt->ignore_case && (p->fixed || p->is_fixed)))
+ do_utf8 = 1;
+
+ if (do_utf8)
options |= (PCRE2_UTF | PCRE2_MATCH_INVALID_UTF);
Well, without the "utf8_all_the_things" probably. That's a reference to
a popular meme, probably better to name it differently, and the
PCRE2_UCP is just something I was experimenting with.
It's late here, but I've got to admit that I'm still a bit confused by
this. Let's see if I can try to sum up why:
Ultimately whether we use PCRE2_UTF *should* have nothing do to with
whether the pattern is UTF-8 or not, because even an expression like:
/.*/
Will behave differently under UTF-8, i.e. character classes change, byte
boundaries change to "character" boundaries etc.
That the existing code has has_non_ascii() and the like is a trade-off
that had to be made for the grep-a-file case, because you might run into
arbitrary binary data, but logs are cleaner/encoded/re-encoded etc.
If you run PCRE in UTF-8 mode it will die on some of that data (as
you'll see from our test suite if you turn it on unconditionally).
Are there cases where my "utf8_all_the_things" POC patch would have
turned on PCRE2_UTF, but yours doesn't? IOW is there a 1=1 mapping still
between the encoding mode log/revision.c thinks it's in & PCRE2_UTF?
Anyway, I've tried to break things with this patch and haven't
succeeded, so maybe it's all OK now, thanks a lot for working on this
again, it's a really neat feature. Just wondering if there's any
remaining edge cases we know about...
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v12 3/3] grep: fix an edge case concerning ascii patterns and UTF-8 data
2021-10-09 1:37 ` Ævar Arnfjörð Bjarmason
@ 2021-10-12 23:01 ` Hamza Mahfooz
0 siblings, 0 replies; 5+ messages in thread
From: Hamza Mahfooz @ 2021-10-12 23:01 UTC (permalink / raw)
To: Ævar Arnfjörð Bjarmason; +Cc: git, Junio C Hamano
On Sat, Oct 9 2021 at 03:37:58 AM +0200, Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
> Are there cases where my "utf8_all_the_things" POC patch would have
> turned on PCRE2_UTF, but yours doesn't? IOW is there a 1=1 mapping
> still
> between the encoding mode log/revision.c thinks it's in & PCRE2_UTF?
As far as I can tell, the only place where ignore_locale is mutated is
in
setup_revisions(), so there should be a 1=1 mapping.
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2021-10-12 23:01 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-08 22:49 [PATCH v12 1/3] grep: refactor next_match() and match_one_pattern() for external use Hamza Mahfooz
2021-10-08 22:49 ` [PATCH v12 2/3] pretty: colorize pattern matches in commit messages Hamza Mahfooz
2021-10-08 22:49 ` [PATCH v12 3/3] grep: fix an edge case concerning ascii patterns and UTF-8 data Hamza Mahfooz
2021-10-09 1:37 ` Ævar Arnfjörð Bjarmason
2021-10-12 23:01 ` Hamza Mahfooz
Code repositories for project(s) associated with this public inbox
https://80x24.org/mirrors/git.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).