git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
From: Ryan Zoeller <rtzoeller@rtzoeller.com>
To: git@vger.kernel.org
Cc: Ryan Zoeller <rtzoeller@rtzoeller.com>
Subject: [RFC 1/1] xdiff: use leading whitespace in function heuristic
Date: Wed, 23 Sep 2020 21:59:19 +0000	[thread overview]
Message-ID: <20200923215859.102981-2-rtzoeller@rtzoeller.com> (raw)
In-Reply-To: <20200923215859.102981-1-rtzoeller@rtzoeller.com>

The regular expressions specified in userdiff.c, as well as user-defined
expressions, allow git to detect which lines of code declare functions
(as well as other notable lines, such as class declarations).

Although useful, these regular expressions can't identify which function
a line of code actually belongs to; they only find the function
declaration closest to it.
Languages which allow for nested functions -- or functions inside of
classes -- can trip this mechanism up.

Since many languages use indentation to associate lines of code with a
function (either semantically or cosmetically), we can use indentation
as an additional heuristic for identifying the owning function.

Specifically, this assumes that a line of code belongs to a function
whose declaration is less indented than the line itself.

Signed-off-by: Ryan Zoeller <rtzoeller@rtzoeller.com>
---
 grep.c            |  2 +-
 line-range.c      |  2 +-
 xdiff-interface.c | 14 +++++++++++++-
 xdiff/xdiff.h     |  2 +-
 xdiff/xemit.c     | 23 +++++++++++++++++------
 5 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/grep.c b/grep.c
index 54af9f813e..3281f19977 100644
--- a/grep.c
+++ b/grep.c
@@ -1555,7 +1555,7 @@ static int match_funcname(struct grep_opt *opt, struct grep_source *gs, char *bo
 
 	if (xecfg) {
 		char buf[1];
-		return xecfg->find_func(bol, eol - bol, buf, 1,
+		return xecfg->find_func(bol, eol - bol, buf, 1, -1,
 					xecfg->find_func_priv) >= 0;
 	}
 
diff --git a/line-range.c b/line-range.c
index 9b50583dc0..eb9540bc76 100644
--- a/line-range.c
+++ b/line-range.c
@@ -119,7 +119,7 @@ static int match_funcname(xdemitconf_t *xecfg, const char *bol, const char *eol)
 {
 	if (xecfg) {
 		char buf[1];
-		return xecfg->find_func(bol, eol - bol, buf, 1,
+		return xecfg->find_func(bol, eol - bol, buf, 1, -1,
 					xecfg->find_func_priv) >= 0;
 	}
 
diff --git a/xdiff-interface.c b/xdiff-interface.c
index 4d20069302..d93cb5c72e 100644
--- a/xdiff-interface.c
+++ b/xdiff-interface.c
@@ -201,7 +201,7 @@ struct ff_regs {
 };
 
 static long ff_regexp(const char *line, long len,
-		char *buffer, long buffer_size, void *priv)
+		char *buffer, long buffer_size, long max_leading_spaces, void *priv)
 {
 	struct ff_regs *regs = priv;
 	regmatch_t pmatch[2];
@@ -216,6 +216,18 @@ static long ff_regexp(const char *line, long len,
 			len--;
 	}
 
+	// TODO: Is it faster to check whitespace only after matching the regex?
+	if (max_leading_spaces >= 0) {
+		long leading_spaces;
+		for (leading_spaces = 0; leading_spaces < len
+				&& leading_spaces <= max_leading_spaces
+				&& isspace(line[leading_spaces]); leading_spaces++)
+			;
+
+		if (leading_spaces > max_leading_spaces)
+			return -1;
+	}
+
 	for (i = 0; i < regs->nr; i++) {
 		struct ff_reg *reg = regs->array + i;
 		if (!regexec_buf(&reg->re, line, len, 2, pmatch, 0)) {
diff --git a/xdiff/xdiff.h b/xdiff/xdiff.h
index 032e3a9f41..f78c30c527 100644
--- a/xdiff/xdiff.h
+++ b/xdiff/xdiff.h
@@ -93,7 +93,7 @@ typedef struct s_xdemitcb {
 	int (*out_line)(void *, mmbuffer_t *, int);
 } xdemitcb_t;
 
-typedef long (*find_func_t)(const char *line, long line_len, char *buffer, long buffer_size, void *priv);
+typedef long (*find_func_t)(const char *line, long line_len, char *buffer, long buffer_size, long max_leading_spaces, void *priv);
 
 typedef int (*xdl_emit_hunk_consume_func_t)(long start_a, long count_a,
 					    long start_b, long count_b,
diff --git a/xdiff/xemit.c b/xdiff/xemit.c
index 9d7d6c5087..1de68008f9 100644
--- a/xdiff/xemit.c
+++ b/xdiff/xemit.c
@@ -95,7 +95,7 @@ xdchange_t *xdl_get_hunk(xdchange_t **xscr, xdemitconf_t const *xecfg)
 }
 
 
-static long def_ff(const char *rec, long len, char *buf, long sz, void *priv)
+static long def_ff(const char *rec, long len, char *buf, long sz, long max_leading_spaces, void *priv)
 {
 	if (len > 0 &&
 			(isalpha((unsigned char)*rec) || /* identifier? */
@@ -112,19 +112,19 @@ static long def_ff(const char *rec, long len, char *buf, long sz, void *priv)
 }
 
 static long match_func_rec(xdfile_t *xdf, xdemitconf_t const *xecfg, long ri,
-			   char *buf, long sz)
+			   char *buf, long sz, long max_leading_spaces)
 {
 	const char *rec;
 	long len = xdl_get_rec(xdf, ri, &rec);
 	if (!xecfg->find_func)
-		return def_ff(rec, len, buf, sz, xecfg->find_func_priv);
-	return xecfg->find_func(rec, len, buf, sz, xecfg->find_func_priv);
+		return def_ff(rec, len, buf, sz, max_leading_spaces, xecfg->find_func_priv);
+	return xecfg->find_func(rec, len, buf, sz, max_leading_spaces, xecfg->find_func_priv);
 }
 
 static int is_func_rec(xdfile_t *xdf, xdemitconf_t const *xecfg, long ri)
 {
 	char dummy[1];
-	return match_func_rec(xdf, xecfg, ri, dummy, sizeof(dummy)) >= 0;
+	return match_func_rec(xdf, xecfg, ri, dummy, sizeof(dummy), -1) >= 0;
 }
 
 struct func_line {
@@ -137,12 +137,23 @@ static long get_func_line(xdfenv_t *xe, xdemitconf_t const *xecfg,
 {
 	long l, size, step = (start > limit) ? -1 : 1;
 	char *buf, dummy[1];
+	long leading_spaces;
+
+	if (start - step >= 0 && start - step < xe->xdf1.nrec) {
+		xrecord_t *first_line = xe->xdf1.recs[start - step];
+
+		for (leading_spaces = 0; first_line->ptr[leading_spaces]
+				&& isspace(first_line->ptr[leading_spaces]); leading_spaces++)
+			;
+	} else {
+		leading_spaces = 0;
+	}
 
 	buf = func_line ? func_line->buf : dummy;
 	size = func_line ? sizeof(func_line->buf) : sizeof(dummy);
 
 	for (l = start; l != limit && 0 <= l && l < xe->xdf1.nrec; l += step) {
-		long len = match_func_rec(&xe->xdf1, xecfg, l, buf, size);
+		long len = match_func_rec(&xe->xdf1, xecfg, l, buf, size, leading_spaces - 1);
 		if (len >= 0) {
 			if (func_line)
 				func_line->len = len;
-- 
2.28.0.586.g47c91ef7fe



  reply	other threads:[~2020-09-23 21:59 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-09-23 21:59 [RFC 0/1] Leading whitespace as a function identification heuristic? Ryan Zoeller
2020-09-23 21:59 ` Ryan Zoeller [this message]
2020-09-24  6:45 ` Junio C Hamano
2020-09-24 21:17   ` Jeff King
2020-09-24 22:01     ` Ryan Zoeller
2020-09-25  9:11       ` Phillip Wood
2020-09-25 18:43         ` Jeff King
2020-09-25 19:01           ` Phillip Wood
2020-09-25 19:05             ` Jeff King
2020-09-25 18:12 ` Johannes Sixt

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://vger.kernel.org/majordomo-info.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200923215859.102981-2-rtzoeller@rtzoeller.com \
    --to=rtzoeller@rtzoeller.com \
    --cc=git@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).