git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
From: Jonathan Tan <jonathantanmy@google.com>
To: git@vger.kernel.org
Cc: Jonathan Tan <jonathantanmy@google.com>,
	peff@peff.net, gitster@pobox.com
Subject: [RFC/PATCH 1/3] mailinfo: refactor commit message processing
Date: Fri, 16 Sep 2016 10:37:22 -0700	[thread overview]
Message-ID: <7dbb4bc0659056211b27f0033c73f0d558efdb54.1474047135.git.jonathantanmy@google.com> (raw)
In-Reply-To: <cover.1474047135.git.jonathantanmy@google.com>
In-Reply-To: <cover.1474047135.git.jonathantanmy@google.com>

Within the processing of the commit message, check for a scissors line
or a patchbreak line first (before checking for in-body headers) so that
a subsequent patch modifying the processing of in-body headers would not
cause a scissors line or patchbreak line to be misidentified.

If a line could be both an in-body header and a scissors line (for
example, "From: -- >8 --"), this is now considered a fatal error
(previously, it would be interpreted as an in-body header).  It is not
possible for a line to be both an in-body header and a patchbreak line,
since the two require different prefixes.

The following enumeration shows that processing is the same except for
the in-body header + scissors line case described above.

o in-body header (check_header OK)
  o passes UTF-8 conversion
    o [described above] is scissors line
    o [not possible] is patchbreak line
    o [not possible] is blank line
    o is none of the above - processed as header
  o fails UTF-8 conversion - processed as header
o not in-body header
  o passes UTF-8 conversion
    o is scissors line - processed as scissors
    o is patchbreak line - processed as patchbreak
    o is blank line - ignored if in header_stage
    o is none of the above - log message
  o fails UTF-8 conversion - input error

As for the result left in "line" (after the invocation of
handle_commit_msg), it is unused (by its caller, handle_filter, and by
handle_filter's callers, handle_boundary and handle_body) unless this
line is a patchbreak line, in which case handle_patch is subsequently
called (in handle_filter) on "line". In this case, "line" must have
passed UTF-8 conversion both before and after this patch, so the result
is still the same overall.

Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
 mailinfo.c | 145 ++++++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 115 insertions(+), 30 deletions(-)

diff --git a/mailinfo.c b/mailinfo.c
index e19abe3..23a56c2 100644
--- a/mailinfo.c
+++ b/mailinfo.c
@@ -340,23 +340,56 @@ static struct strbuf *decode_b_segment(const struct strbuf *b_seg)
 	return out;
 }
 
-static int convert_to_utf8(struct mailinfo *mi,
-			   struct strbuf *line, const char *charset)
+/*
+ * Attempts to convert line into UTF-8, storing the result in line.
+ *
+ * This differs from convert_to_utf8 in that conversion non-success is not
+ * considered an error case - mi->input_error is not set, and no error message
+ * is printed.
+ *
+ * If the conversion is unnecessary, returns 0 and stores NULL in old_buf (if
+ * old_buf is not NULL).
+ *
+ * If the conversion is successful, returns 0 and stores the unconverted string
+ * in old_buf and old_len (if they are respectively not NULL).
+ *
+ * If the conversion is unsuccessful, returns -1.
+ */
+static int try_convert_to_utf8(const struct mailinfo *mi, struct strbuf *line,
+			       const char *charset, char **old_buf,
+			       size_t *old_len)
 {
-	char *out;
+	char *utf8;
 
-	if (!mi->metainfo_charset || !charset || !*charset)
+	if (!mi->metainfo_charset || !charset || !*charset ||
+	    same_encoding(mi->metainfo_charset, charset)) {
+		if (old_buf)
+			*old_buf = NULL;
 		return 0;
+	}
 
-	if (same_encoding(mi->metainfo_charset, charset))
+	utf8 = reencode_string(line->buf, mi->metainfo_charset, charset);
+	if (utf8) {
+		char *temp = strbuf_detach(line, old_len);
+		if (old_buf)
+			*old_buf = temp;
+		strbuf_attach(line, utf8, strlen(utf8), strlen(utf8));
 		return 0;
-	out = reencode_string(line->buf, mi->metainfo_charset, charset);
-	if (!out) {
+	}
+	return -1;
+}
+
+/*
+ * Converts line into UTF-8, setting mi->input_error to -1 upon failure.
+ */
+static int convert_to_utf8(struct mailinfo *mi,
+			   struct strbuf *line, const char *charset)
+{
+	if (try_convert_to_utf8(mi, line, charset, NULL, NULL)) {
 		mi->input_error = -1;
 		return error("cannot convert from %s to %s",
 			     charset, mi->metainfo_charset);
 	}
-	strbuf_attach(line, out, strlen(out), strlen(out));
 	return 0;
 }
 
@@ -515,6 +548,13 @@ static int check_header(struct mailinfo *mi,
 	return ret;
 }
 
+static int check_header_raw(struct mailinfo *mi,
+			    char *buf, size_t len,
+			    struct strbuf *hdr_data[], int overwrite) {
+	const struct strbuf sb = {0, len, buf};
+	return check_header(mi, &sb, hdr_data, overwrite);
+}
+
 static void decode_transfer_encoding(struct mailinfo *mi, struct strbuf *line)
 {
 	struct strbuf *ret;
@@ -623,32 +663,48 @@ static int is_scissors_line(const struct strbuf *line)
 		gap * 2 < perforation);
 }
 
-static int handle_commit_msg(struct mailinfo *mi, struct strbuf *line)
+static int resembles_rfc2822_header(const struct strbuf *line)
 {
-	assert(!mi->filter_stage);
+	char *c;
 
-	if (mi->header_stage) {
-		if (!line->len || (line->len == 1 && line->buf[0] == '\n'))
+	if (!isalpha(line->buf[0]))
+		return 0;
+
+	for (c = line->buf + 1; *c != 0; c++) {
+		if (*c == ':')
+			return 1;
+		else if (*c != '-' && !isalpha(*c))
 			return 0;
 	}
+	return 0;
+}
 
-	if (mi->use_inbody_headers && mi->header_stage) {
-		mi->header_stage = check_header(mi, line, mi->s_hdr_data, 0);
-		if (mi->header_stage)
-			return 0;
-	} else
-		/* Only trim the first (blank) line of the commit message
-		 * when ignoring in-body headers.
-		 */
-		mi->header_stage = 0;
+static int handle_commit_msg(struct mailinfo *mi, struct strbuf *line)
+{
+	int ret = 0;
+	int utf8_result;
+	char *old_buf;
+	size_t old_len;
+
+	assert(!mi->filter_stage);
 
-	/* normalize the log message to UTF-8. */
-	if (convert_to_utf8(mi, line, mi->charset.buf))
-		return 0; /* mi->input_error already set */
+	/*
+	 * Obtain UTF8 for scissors line and patchbreak checks, but retain the
+	 * undecoded line in case we need to process it as an in-body header.
+	 */
+	utf8_result = try_convert_to_utf8(mi, line, mi->charset.buf, &old_buf,
+					  &old_len);
 
-	if (mi->use_scissors && is_scissors_line(line)) {
+	if (!utf8_result && mi->use_scissors && is_scissors_line(line)) {
 		int i;
 
+		if (resembles_rfc2822_header(line))
+			/*
+			 * Explicitly reject scissor lines that resemble a RFC
+			 * 2822 header, to avoid being prone to error.
+			 */
+			die("scissors line resembles RFC 2822 header");
+
 		strbuf_setlen(&mi->log_message, 0);
 		mi->header_stage = 1;
 
@@ -661,18 +717,47 @@ static int handle_commit_msg(struct mailinfo *mi, struct strbuf *line)
 				strbuf_release(mi->s_hdr_data[i]);
 			mi->s_hdr_data[i] = NULL;
 		}
-		return 0;
+		goto handle_commit_msg_out;
 	}
-
-	if (patchbreak(line)) {
+	if (!utf8_result && patchbreak(line)) {
 		if (mi->message_id)
 			strbuf_addf(&mi->log_message,
 				    "Message-Id: %s\n", mi->message_id);
-		return 1;
+		ret = 1;
+		goto handle_commit_msg_out;
 	}
 
+	if (mi->header_stage) {
+		char *buf = old_buf ? old_buf : line->buf;
+		if (buf[0] == 0 || (buf[0] == '\n' && buf[1] == 0))
+			goto handle_commit_msg_out;
+	}
+
+	if (mi->use_inbody_headers && mi->header_stage) {
+		char *buf = old_buf ? old_buf : line->buf;
+		size_t len = old_buf ? old_len : line->len;
+		mi->header_stage = check_header_raw(mi, buf, len,
+						    mi->s_hdr_data, 0);
+		if (mi->header_stage)
+			goto handle_commit_msg_out;
+	} else
+		/* Only trim the first (blank) line of the commit message
+		 * when ignoring in-body headers.
+		 */
+		mi->header_stage = 0;
+
+	/* If adding as a log message, conversion to UTF-8 is required. */
+	if (utf8_result) {
+		mi->input_error = -1;
+		error("cannot convert from %s to %s",
+		      mi->charset.buf, mi->metainfo_charset);
+		goto handle_commit_msg_out;
+	}
 	strbuf_addbuf(&mi->log_message, line);
-	return 0;
+
+handle_commit_msg_out:
+	free(old_buf);
+	return ret;
 }
 
 static void handle_patch(struct mailinfo *mi, const struct strbuf *line)
-- 
2.10.0.rc2.20.g5b18e70


  parent reply	other threads:[~2016-09-16 17:37 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-09-02 19:58 [PATCH] sequencer: support folding in rfc2822 footer Jonathan Tan
2016-09-03  2:23 ` Junio C Hamano
2016-09-06 22:08   ` Jonathan Tan
2016-09-06 23:30     ` Jonathan Tan
2016-09-07  6:38       ` Jeff King
2016-09-16 17:37         ` [RFC/PATCH 0/3] handle multiline in-body headers Jonathan Tan
2016-09-16 18:29           ` Junio C Hamano
2016-09-16 17:37         ` Jonathan Tan [this message]
2016-09-16 19:12           ` [RFC/PATCH 1/3] mailinfo: refactor commit message processing Junio C Hamano
2016-09-16 21:46             ` Jeff King
2016-09-16 17:37         ` [RFC/PATCH 2/3] mailinfo: correct malformed test example Jonathan Tan
2016-09-16 19:19           ` Junio C Hamano
2016-09-16 22:42             ` Jonathan Tan
2016-09-16 22:55               ` Junio C Hamano
2016-09-17  0:31                 ` Jonathan Tan
2016-09-17  3:48                   ` Junio C Hamano
2016-09-16 17:37         ` [RFC/PATCH 3/3] mailinfo: handle in-body header continuations Jonathan Tan
2016-09-16 20:17           ` Junio C Hamano
2016-09-16 20:49             ` Jonathan Tan
2016-09-16 20:59               ` Junio C Hamano
2016-09-16 22:36                 ` Jonathan Tan
2016-09-16 23:04                   ` Junio C Hamano
2016-09-17  0:22                     ` Jonathan Tan
2016-09-16 21:51           ` Jeff King

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://vger.kernel.org/majordomo-info.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=7dbb4bc0659056211b27f0033c73f0d558efdb54.1474047135.git.jonathantanmy@google.com \
    --to=jonathantanmy@google.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=peff@peff.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).