From: lars.schneider@autodesk.com
To: git@vger.kernel.org
Cc: gitster@pobox.com, tboegi@web.de, j6t@kdbg.org,
sunshine@sunshineco.com, peff@peff.net,
ramsay@ramsayjones.plus.com, Johannes.Schindelin@gmx.de,
pclouds@gmail.com, Lars Schneider <larsxschneider@gmail.com>
Subject: [PATCH v11 07/10] convert: check for detectable errors in UTF encodings
Date: Fri, 9 Mar 2018 18:35:33 +0100 [thread overview]
Message-ID: <20180309173536.62012-8-lars.schneider@autodesk.com> (raw)
In-Reply-To: <20180309173536.62012-1-lars.schneider@autodesk.com>
From: Lars Schneider <larsxschneider@gmail.com>
Check that new content is valid with respect to the user defined
'working-tree-encoding' attribute.
Signed-off-by: Lars Schneider <larsxschneider@gmail.com>
---
convert.c | 48 ++++++++++++++++++++++++++++++++++
t/t0028-working-tree-encoding.sh | 56 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 104 insertions(+)
diff --git a/convert.c b/convert.c
index aa59ecfe49..b80d666a6b 100644
--- a/convert.c
+++ b/convert.c
@@ -266,6 +266,51 @@ static int will_convert_lf_to_crlf(size_t len, struct text_stat *stats,
}
+static int validate_encoding(const char *path, const char *enc,
+ const char *data, size_t len, int die_on_error)
+{
+ /* We only check for UTF here as UTF?? can be an alias for UTF-?? */
+ if (istarts_with(enc, "UTF")) {
+ /*
+ * Check for detectable errors in UTF encodings
+ */
+ if (has_prohibited_utf_bom(enc, data, len)) {
+ const char *error_msg = _(
+ "BOM is prohibited in '%s' if encoded as %s");
+ /*
+ * This advice is shown for UTF-??BE and UTF-??LE encodings.
+ */
+ const char *advise_msg = _(
+ "The file '%s' contains a byte order "
+ "mark (BOM). Please use %.6s as "
+ "working-tree-encoding.");
+ advise(advise_msg, path, enc);
+ if (die_on_error)
+ die(error_msg, path, enc);
+ else {
+ return error(error_msg, path, enc);
+ }
+
+ } else if (is_missing_required_utf_bom(enc, data, len)) {
+ const char *error_msg = _(
+ "BOM is required in '%s' if encoded as %s");
+ const char *advise_msg = _(
+ "The file '%s' is missing a byte order "
+ "mark (BOM). Please use %sBE or %sLE "
+ "(depending on the byte order) as "
+ "working-tree-encoding.");
+ advise(advise_msg, path, enc, enc);
+ if (die_on_error)
+ die(error_msg, path, enc);
+ else {
+ return error(error_msg, path, enc);
+ }
+ }
+
+ }
+ return 0;
+}
+
static const char *default_encoding = "UTF-8";
static int encode_to_git(const char *path, const char *src, size_t src_len,
@@ -291,6 +336,9 @@ static int encode_to_git(const char *path, const char *src, size_t src_len,
if (!buf && !src)
return 1;
+ if (validate_encoding(path, enc, src, src_len, die_on_error))
+ return 0;
+
dst = reencode_string_len(src, src_len, default_encoding, enc,
&dst_len);
if (!dst) {
diff --git a/t/t0028-working-tree-encoding.sh b/t/t0028-working-tree-encoding.sh
index e492945a01..e8408dfe5c 100755
--- a/t/t0028-working-tree-encoding.sh
+++ b/t/t0028-working-tree-encoding.sh
@@ -62,6 +62,46 @@ test_expect_success 'check $GIT_DIR/info/attributes support' '
for i in 16 32
do
+ test_expect_success "check prohibited UTF-${i} BOM" '
+ test_when_finished "git reset --hard HEAD" &&
+
+ echo "*.utf${i}be text working-tree-encoding=utf-${i}be" >>.gitattributes &&
+ echo "*.utf${i}le text working-tree-encoding=utf-${i}LE" >>.gitattributes &&
+
+ # Here we add UTF-16 (resp. UTF-32) files with a BOM (big/little-endian)
+ # but we tell Git to treat them as UTF-16BE/UTF-16LE (resp. UTF-32BE/UTF-32LE).
+ # In these cases the BOM is prohibited.
+ cp bebom.utf${i}be.raw bebom.utf${i}be &&
+ test_must_fail git add bebom.utf${i}be 2>err.out &&
+ test_i18ngrep "fatal: BOM is prohibited .* utf-${i}be" err.out &&
+
+ cp lebom.utf${i}le.raw lebom.utf${i}be &&
+ test_must_fail git add lebom.utf${i}be 2>err.out &&
+ test_i18ngrep "fatal: BOM is prohibited .* utf-${i}be" err.out &&
+
+ cp bebom.utf${i}be.raw bebom.utf${i}le &&
+ test_must_fail git add bebom.utf${i}le 2>err.out &&
+ test_i18ngrep "fatal: BOM is prohibited .* utf-${i}LE" err.out &&
+
+ cp lebom.utf${i}le.raw lebom.utf${i}le &&
+ test_must_fail git add lebom.utf${i}le 2>err.out &&
+ test_i18ngrep "fatal: BOM is prohibited .* utf-${i}LE" err.out
+ '
+
+ test_expect_success "check required UTF-${i} BOM" '
+ test_when_finished "git reset --hard HEAD" &&
+
+ echo "*.utf${i} text working-tree-encoding=utf-${i}" >>.gitattributes &&
+
+ cp nobom.utf${i}be.raw nobom.utf${i} &&
+ test_must_fail git add nobom.utf${i} 2>err.out &&
+ test_i18ngrep "fatal: BOM is required .* utf-${i}" err.out &&
+
+ cp nobom.utf${i}le.raw nobom.utf${i} &&
+ test_must_fail git add nobom.utf${i} 2>err.out &&
+ test_i18ngrep "fatal: BOM is required .* utf-${i}" err.out
+ '
+
test_expect_success "eol conversion for UTF-${i} encoded files on checkout" '
test_when_finished "rm -f crlf.utf${i}.raw lf.utf${i}.raw" &&
test_when_finished "git reset --hard HEAD^" &&
@@ -141,4 +181,20 @@ test_expect_success 'error if encoding round trip is not the same during refresh
test_i18ngrep "error: .* overwritten by checkout:" err.out
'
+test_expect_success 'error if encoding garbage is already in Git' '
+ BEFORE_STATE=$(git rev-parse HEAD) &&
+ test_when_finished "git reset --hard $BEFORE_STATE" &&
+
+ # Skip the UTF-16 filter for the added file
+ # This simulates a Git version that has no working-tree-encoding support
+ cp nobom.utf16be.raw nonsense.utf16 &&
+ TEST_HASH=$(git hash-object --no-filters -w nonsense.utf16) &&
+ git update-index --add --cacheinfo 100644 $TEST_HASH nonsense.utf16 &&
+ COMMIT=$(git commit-tree -p $(git rev-parse HEAD) -m "plain commit" $(git write-tree)) &&
+ git update-ref refs/heads/master $COMMIT &&
+
+ git diff 2>err.out &&
+ test_i18ngrep "error: BOM is required" err.out
+'
+
test_done
--
2.16.2
next prev parent reply other threads:[~2018-03-09 17:37 UTC|newest]
Thread overview: 25+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-03-09 17:35 [PATCH v11 00/10] convert: add support for different encodings lars.schneider
2018-03-09 17:35 ` [PATCH v11 01/10] strbuf: remove unnecessary NUL assignment in xstrdup_tolower() lars.schneider
2018-03-09 17:35 ` [PATCH v11 02/10] strbuf: add xstrdup_toupper() lars.schneider
2018-03-09 17:35 ` [PATCH v11 03/10] strbuf: add a case insensitive starts_with() lars.schneider
2018-03-09 17:35 ` [PATCH v11 04/10] utf8: add function to detect prohibited UTF-16/32 BOM lars.schneider
2018-03-09 17:35 ` [PATCH v11 05/10] utf8: add function to detect a missing " lars.schneider
2018-03-09 17:35 ` [PATCH v11 06/10] convert: add 'working-tree-encoding' attribute lars.schneider
2018-03-09 19:10 ` Junio C Hamano
2018-03-15 21:23 ` Lars Schneider
2018-03-18 7:24 ` Torsten Bögershausen
2018-04-01 13:24 ` Lars Schneider
2018-04-05 16:41 ` Torsten Bögershausen
2018-04-15 16:54 ` Lars Schneider
2018-03-09 17:35 ` lars.schneider [this message]
2018-03-09 19:00 ` [PATCH v11 07/10] convert: check for detectable errors in UTF encodings Junio C Hamano
2018-03-09 19:04 ` Lars Schneider
2018-03-09 19:10 ` Junio C Hamano
2018-03-09 17:35 ` [PATCH v11 08/10] convert: advise canonical UTF encoding names lars.schneider
2018-03-09 19:11 ` Junio C Hamano
2018-03-15 22:42 ` Lars Schneider
2018-03-09 17:35 ` [PATCH v11 09/10] convert: add tracing for 'working-tree-encoding' attribute lars.schneider
2018-03-09 17:35 ` [PATCH v11 10/10] convert: add round trip check based on 'core.checkRoundtripEncoding' lars.schneider
2018-03-09 20:18 ` Eric Sunshine
2018-03-09 20:22 ` Junio C Hamano
2018-03-09 20:27 ` Eric Sunshine
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: http://vger.kernel.org/majordomo-info.html
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20180309173536.62012-8-lars.schneider@autodesk.com \
--to=lars.schneider@autodesk.com \
--cc=Johannes.Schindelin@gmx.de \
--cc=git@vger.kernel.org \
--cc=gitster@pobox.com \
--cc=j6t@kdbg.org \
--cc=larsxschneider@gmail.com \
--cc=pclouds@gmail.com \
--cc=peff@peff.net \
--cc=ramsay@ramsayjones.plus.com \
--cc=sunshine@sunshineco.com \
--cc=tboegi@web.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/mirrors/git.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).