From: tboegi@web.de
To: git@vger.kernel.org
Cc: "Torsten Bögershausen" <tboegi@web.de>
Subject: [PATCH v9 2/6] convert.c: stream and early out
Date: Sat, 7 May 2016 08:11:00 +0200 [thread overview]
Message-ID: <1462601460-23543-1-git-send-email-tboegi@web.de> (raw)
In-Reply-To: <xmqqegblor2l.fsf@gitster.mtv.corp.google.com>
From: Torsten Bögershausen <tboegi@web.de>
When statistics are gathered for the autocrlf handling, the search in
the content can be stopped early, e.g. when
- a search for binary is done, and a NUL character is found
- a search for CRLF is done, and the first CRLF is found.
Similarly, when statistics for binary vs non-binary are gathered:
whenever a lone CR or NUL is found, the search can be aborted.
When checking out files in "auto" mode, any file that has a "lone CR"
or a CRLF will not be converted, so the search can be aborted early.
Add the new bit, CONVERT_STAT_BITS_ANY_CR,
which is set for either lone CR or CRLF.
Many binary files have a NUL very early (within the first few bytes,
at the latest within the first 1..2K).
It is often not necessary to load the whole content of a file or blob
into memory.
Use streaming reads for blobs and files in the worktree.
Signed-off-by: Torsten Bögershausen <tboegi@web.de>
---
convert.c | 159 ++++++++++++++++++++++++++++++++++++++++----------------------
1 file changed, 103 insertions(+), 56 deletions(-)
diff --git a/convert.c b/convert.c
index b1614bf..24ab095 100644
--- a/convert.c
+++ b/convert.c
@@ -3,6 +3,7 @@
#include "run-command.h"
#include "quote.h"
#include "sigchain.h"
+#include "streaming.h"
/*
* convert.c - convert a file when checking it out and checking it in.
@@ -13,10 +14,10 @@
* translation when the "text" attribute or "auto_crlf" option is set.
*/
-/* Stat bits: When BIN is set, the txt bits are unset */
#define CONVERT_STAT_BITS_TXT_LF 0x1
#define CONVERT_STAT_BITS_TXT_CRLF 0x2
#define CONVERT_STAT_BITS_BIN 0x4
+#define CONVERT_STAT_BITS_ANY_CR 0x8
enum crlf_action {
CRLF_UNDEFINED,
@@ -31,30 +32,36 @@ enum crlf_action {
struct text_stat {
/* NUL, CR, LF and CRLF counts */
- unsigned nul, lonecr, lonelf, crlf;
+ unsigned stat_bits, lonecr, lonelf, crlf;
/* These are just approximations! */
unsigned printable, nonprintable;
};
-static void gather_stats(const char *buf, unsigned long size, struct text_stat *stats)
+static void do_gather_stats(const char *buf, unsigned long size,
+ struct text_stat *stats, unsigned earlyout)
{
unsigned long i;
- memset(stats, 0, sizeof(*stats));
-
+ if (!buf || !size)
+ return;
for (i = 0; i < size; i++) {
unsigned char c = buf[i];
if (c == '\r') {
+ stats->stat_bits |= CONVERT_STAT_BITS_ANY_CR;
if (i+1 < size && buf[i+1] == '\n') {
stats->crlf++;
i++;
- } else
+ stats->stat_bits |= CONVERT_STAT_BITS_TXT_CRLF;
+ } else {
stats->lonecr++;
+ stats->stat_bits |= CONVERT_STAT_BITS_BIN;
+ }
continue;
}
if (c == '\n') {
stats->lonelf++;
+ stats->stat_bits |= CONVERT_STAT_BITS_TXT_LF;
continue;
}
if (c == 127)
@@ -67,7 +74,7 @@ static void gather_stats(const char *buf, unsigned long size, struct text_stat *
stats->printable++;
break;
case 0:
- stats->nul++;
+ stats->stat_bits |= CONVERT_STAT_BITS_BIN;
/* fall through */
default:
stats->nonprintable++;
@@ -75,6 +82,8 @@ static void gather_stats(const char *buf, unsigned long size, struct text_stat *
}
else
stats->printable++;
+ if (stats->stat_bits & earlyout)
+ break; /* We found what we have been searching for */
}
/* If file ends with EOF then don't count this EOF as non-printable. */
@@ -86,41 +95,62 @@ static void gather_stats(const char *buf, unsigned long size, struct text_stat *
* The same heuristics as diff.c::mmfile_is_binary()
* We treat files with bare CR as binary
*/
-static int convert_is_binary(unsigned long size, const struct text_stat *stats)
+static void convert_nonprintable(struct text_stat *stats)
{
- if (stats->lonecr)
- return 1;
- if (stats->nul)
- return 1;
if ((stats->printable >> 7) < stats->nonprintable)
- return 1;
- return 0;
+ stats->stat_bits |= CONVERT_STAT_BITS_BIN;
+}
+
+static void gather_stats(const char *buf, unsigned long size,
+ struct text_stat *stats, unsigned earlyout)
+{
+ memset(stats, 0, sizeof(*stats));
+ do_gather_stats(buf, size, stats, earlyout);
+ convert_nonprintable(stats);
}
-static unsigned int gather_convert_stats(const char *data, unsigned long size)
+
+static unsigned get_convert_stats_sha1(unsigned const char *sha1,
+ unsigned earlyout)
{
+ struct git_istream *st;
struct text_stat stats;
- int ret = 0;
- if (!data || !size)
- return 0;
- gather_stats(data, size, &stats);
- if (convert_is_binary(size, &stats))
- ret |= CONVERT_STAT_BITS_BIN;
- if (stats.crlf)
- ret |= CONVERT_STAT_BITS_TXT_CRLF;
- if (stats.lonelf)
- ret |= CONVERT_STAT_BITS_TXT_LF;
+ enum object_type type;
+ unsigned long sz;
- return ret;
+ if (!sha1)
+ return 0;
+ memset(&stats, 0, sizeof(stats));
+ st = open_istream(sha1, &type, &sz, NULL);
+ if (!st) {
+ return 0;
+ }
+ if (type != OBJ_BLOB)
+ goto close_and_exit_i;
+ for (;;) {
+ char buf[1024];
+ ssize_t readlen = read_istream(st, buf, sizeof(buf));
+ if (readlen < 0)
+ break;
+ if (!readlen)
+ break;
+ do_gather_stats(buf, (unsigned long)readlen, &stats, earlyout);
+ if (stats.stat_bits & earlyout)
+ break; /* We found what we have been searching for */
+ }
+close_and_exit_i:
+ close_istream(st);
+ convert_nonprintable(&stats);
+ return stats.stat_bits;
}
-static const char *gather_convert_stats_ascii(const char *data, unsigned long size)
+static const char *convert_stats_ascii(unsigned convert_stats)
{
- unsigned int convert_stats = gather_convert_stats(data, size);
-
+ unsigned mask = CONVERT_STAT_BITS_TXT_LF |
+ CONVERT_STAT_BITS_TXT_CRLF;
if (convert_stats & CONVERT_STAT_BITS_BIN)
return "-text";
- switch (convert_stats) {
+ switch (convert_stats & mask) {
case CONVERT_STAT_BITS_TXT_LF:
return "lf";
case CONVERT_STAT_BITS_TXT_CRLF:
@@ -132,24 +162,45 @@ static const char *gather_convert_stats_ascii(const char *data, unsigned long si
}
}
+static unsigned get_convert_stats_wt(const char *path)
+{
+ struct text_stat stats;
+ unsigned earlyout = CONVERT_STAT_BITS_BIN;
+ int fd;
+ memset(&stats, 0, sizeof(stats));
+ fd = open(path, O_RDONLY);
+ if (fd < 0)
+ return 0;
+ for (;;) {
+ char buf[1024];
+ ssize_t readlen = read(fd, buf, sizeof(buf));
+ if (readlen < 0)
+ break;
+ if (!readlen)
+ break;
+ do_gather_stats(buf, (unsigned long)readlen, &stats, earlyout);
+ if (stats.stat_bits & earlyout)
+ break; /* We found what we have been searching for */
+ }
+ close(fd);
+ convert_nonprintable(&stats);
+ return stats.stat_bits;
+}
+
const char *get_cached_convert_stats_ascii(const char *path)
{
- const char *ret;
- unsigned long sz;
- void *data = read_blob_data_from_cache(path, &sz);
- ret = gather_convert_stats_ascii(data, sz);
- free(data);
- return ret;
+ unsigned convert_stats;
+ unsigned earlyout = CONVERT_STAT_BITS_BIN;
+ convert_stats = get_convert_stats_sha1(get_sha1_from_cache(path),
+ earlyout);
+ return convert_stats_ascii(convert_stats);
}
const char *get_wt_convert_stats_ascii(const char *path)
{
- const char *ret = "";
- struct strbuf sb = STRBUF_INIT;
- if (strbuf_read_file(&sb, path, 0) >= 0)
- ret = gather_convert_stats_ascii(sb.buf, sb.len);
- strbuf_release(&sb);
- return ret;
+ unsigned convert_stats;
+ convert_stats = get_convert_stats_wt(path);
+ return convert_stats_ascii(convert_stats);
}
static int text_eol_is_crlf(void)
@@ -219,16 +270,10 @@ static void check_safe_crlf(const char *path, enum crlf_action crlf_action,
static int has_cr_in_index(const char *path)
{
- unsigned long sz;
- void *data;
- int has_cr;
-
- data = read_blob_data_from_cache(path, &sz);
- if (!data)
- return 0;
- has_cr = memchr(data, '\r', sz) != NULL;
- free(data);
- return has_cr;
+ unsigned convert_stats;
+ convert_stats = get_convert_stats_sha1(get_sha1_from_cache(path),
+ CONVERT_STAT_BITS_ANY_CR);
+ return convert_stats & CONVERT_STAT_BITS_ANY_CR;
}
static int crlf_to_git(const char *path, const char *src, size_t len,
@@ -249,10 +294,10 @@ static int crlf_to_git(const char *path, const char *src, size_t len,
if (!buf && !src)
return 1;
- gather_stats(src, len, &stats);
+ gather_stats(src, len, &stats, CONVERT_STAT_BITS_BIN);
if (crlf_action == CRLF_AUTO || crlf_action == CRLF_AUTO_INPUT || crlf_action == CRLF_AUTO_CRLF) {
- if (convert_is_binary(len, &stats))
+ if (stats.stat_bits & CONVERT_STAT_BITS_BIN)
return 0;
if (crlf_action == CRLF_AUTO_INPUT || crlf_action == CRLF_AUTO_CRLF) {
@@ -309,11 +354,13 @@ static int crlf_to_worktree(const char *path, const char *src, size_t len,
{
char *to_free = NULL;
struct text_stat stats;
+ unsigned earlyout = CONVERT_STAT_BITS_TXT_CRLF | CONVERT_STAT_BITS_BIN;
+
if (!len || output_eol(crlf_action) != EOL_CRLF)
return 0;
- gather_stats(src, len, &stats);
+ gather_stats(src, len, &stats, earlyout);
/* No "naked" LF? Nothing to convert, regardless. */
if (!stats.lonelf)
@@ -327,7 +374,7 @@ static int crlf_to_worktree(const char *path, const char *src, size_t len,
return 0;
}
- if (convert_is_binary(len, &stats))
+ if (stats.stat_bits & CONVERT_STAT_BITS_BIN)
return 0;
}
--
2.0.0.rc1.6318.g0c2c796
next prev parent reply other threads:[~2016-05-07 6:06 UTC|newest]
Thread overview: 126+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <Message-Id=xmqqio26nqk8.fsf@gitster.mtv.corp.google.com>
2016-02-11 16:16 ` [PATCH 1/3] git reset --hard gives clean working tree tboegi
2016-02-11 18:49 ` Junio C Hamano
2016-03-05 7:23 ` Torsten Bögershausen
2016-03-05 8:05 ` Junio C Hamano
2016-03-05 8:27 ` Torsten Bögershausen
2016-03-05 21:18 ` Junio C Hamano
2016-03-07 8:14 ` Junio C Hamano
2016-03-07 8:51 ` Junio C Hamano
2016-03-07 8:58 ` Torsten Bögershausen
2016-03-07 22:34 ` Junio C Hamano
2016-03-29 13:25 ` [PATCH v1 1/7] Make it possible to get sha1 for a path from the index tboegi
2016-03-29 13:28 ` Duy Nguyen
2016-03-29 13:31 ` Duy Nguyen
2016-03-29 15:05 ` Torsten Bögershausen
2016-03-29 19:32 ` Eric Sunshine
2016-03-29 13:25 ` [PATCH v1 2/7] convert.c: stream and early out tboegi
2016-03-29 13:25 ` [PATCH v1 3/7] Allow core.autocrlf=input and core.eol=crlf tboegi
2016-03-29 13:25 ` [PATCH v1 4/7] t0027: TC for combined attributes tboegi
2016-03-29 13:25 ` [PATCH v1 5/7] CRLF: unify the "auto" handling tboegi
2016-03-29 19:42 ` Eric Sunshine
2016-03-29 13:25 ` [PATCH v1 6/7] correct blame for files commited with CRLF tboegi
2016-03-29 17:21 ` Junio C Hamano
2016-03-29 19:51 ` Torsten Bögershausen
2016-03-29 19:58 ` Junio C Hamano
2016-03-29 20:25 ` Junio C Hamano
2016-03-29 20:32 ` Junio C Hamano
2016-03-29 20:50 ` Junio C Hamano
2016-03-30 17:48 ` Torsten Bögershausen
2016-03-29 13:25 ` [PATCH v1 7/7] convert.c: more safer crlf handling with text attribute tboegi
2016-03-29 18:37 ` Junio C Hamano
2016-04-01 16:08 ` [PATCH v2 1/7] Make it possible to get sha1 for a path from the index tboegi
2016-04-01 16:08 ` [PATCH v2 2/7] convert.c: stream and early out tboegi
2016-04-01 16:08 ` [PATCH v2 3/7] Allow core.autocrlf=input and core.eol=crlf tboegi
2016-04-01 22:20 ` Junio C Hamano
2016-04-01 16:08 ` [PATCH v2 4/7] t0027: TC for combined attributes tboegi
2016-04-01 22:22 ` Junio C Hamano
2016-04-01 16:08 ` [PATCH v2 5/7] CRLF: unify the "auto" handling tboegi
2016-04-01 22:25 ` Junio C Hamano
2016-04-01 16:08 ` [PATCH v2 6/7] correct blame for files commited with CRLF tboegi
2016-04-01 22:29 ` Junio C Hamano
2016-04-03 9:29 ` Torsten Bögershausen
2016-04-01 16:08 ` [PATCH v2 7/7] convert.c: more safer crlf handling with text attribute tboegi
2016-04-05 19:23 ` [PATCH v1] correct blame for files commited with CRLF tboegi
2016-04-05 20:57 ` Junio C Hamano
2016-04-05 21:12 ` Junio C Hamano
2016-04-06 4:17 ` Torsten Bögershausen
2016-04-19 13:24 ` [PATCH v5 1/4] t0027: Make more reliable tboegi
2016-04-19 13:26 ` [PATCH v5 2/4] convert: allow core.autocrlf=input and core.eol=crlf tboegi
2016-04-19 13:26 ` [PATCH v5 3/4] t0027: test cases for combined attributes tboegi
2016-04-19 21:32 ` Junio C Hamano
2016-04-20 15:52 ` Torsten Bögershausen
2016-04-19 13:26 ` [PATCH v5 4/4] convert.c: ident + core.autocrlf didn't work tboegi
2016-04-20 22:27 ` Junio C Hamano
2016-04-22 14:38 ` [PATCH v6 01/10] t0027: Make more reliable tboegi
2016-04-22 22:03 ` Junio C Hamano
2016-04-24 3:45 ` Torsten Bögershausen
2016-04-22 14:53 ` [PATCH v6 02/10] convert: allow core.autocrlf=input and core.eol=crlf tboegi
2016-04-22 14:53 ` [PATCH v6 03/10] t0027: test cases for combined attributes tboegi
2016-04-22 14:53 ` [PATCH v6 04/10] convert.c: ident + core.autocrlf didn't work tboegi
2016-04-22 14:53 ` [PATCH v6 05/10] read-cache: factor out get_sha1_from_index() helper tboegi
2016-04-22 14:53 ` [PATCH v6 06/10] convert.c: stream and early out tboegi
2016-04-22 14:53 ` [PATCH v6 07/10] convert: unify the "auto" handling of CRLF tboegi
2016-04-22 14:53 ` [PATCH v6 08/10] convert.c: more safer crlf handling with text attribute tboegi
2016-04-22 14:53 ` [PATCH v6 09/10] t6038; use crlf on all platforms tboegi
2016-04-22 14:53 ` [PATCH v6 10/10] ce_compare_data() did not respect conversion tboegi
2016-04-24 15:10 ` [PATCH v6b 01/10] t0027: Make commit_chk_wrnNNO() reliable tboegi
2016-04-24 15:11 ` [PATCH v6b 02/10] convert: allow core.autocrlf=input and core.eol=crlf tboegi
2016-04-24 15:11 ` [PATCH v6b 03/10] t0027: test cases for combined attributes tboegi
2016-04-24 15:11 ` [PATCH v6b 04/10] convert.c: ident + core.autocrlf didn't work tboegi
2016-04-24 15:11 ` [PATCH v6b 05/10] read-cache: factor out get_sha1_from_index() helper tboegi
2016-04-24 15:11 ` [PATCH v6b 06/10] convert.c: stream and early out tboegi
2016-04-24 15:11 ` [PATCH v6b 07/10] convert: unify the "auto" handling of CRLF tboegi
2016-04-24 15:11 ` [PATCH v6b 08/10] convert.c: more safer crlf handling with text attribute tboegi
2016-04-24 15:11 ` [PATCH v6b 09/10] t6038; use crlf on all platforms tboegi
2016-04-24 15:11 ` [PATCH v6b 10/10] ce_compare_data() did not respect conversion tboegi
2016-04-25 16:56 ` [PATCH v7 01/10] t0027: Make commit_chk_wrnNNO() reliable tboegi
2016-04-25 19:15 ` Junio C Hamano
2016-04-25 16:56 ` [PATCH v7 02/10] convert: allow core.autocrlf=input and core.eol=crlf tboegi
2016-04-25 16:56 ` [PATCH v7 03/10] t0027: test cases for combined attributes tboegi
2016-04-25 16:56 ` [PATCH v7 04/10] convert.c: ident + core.autocrlf didn't work tboegi
2016-04-25 16:56 ` [PATCH v7 05/10] read-cache: factor out get_sha1_from_index() helper tboegi
2016-04-25 16:56 ` [PATCH v7 06/10] convert.c: stream and early out tboegi
2016-04-25 16:56 ` [PATCH v7 07/10] convert: unify the "auto" handling of CRLF tboegi
2016-04-25 19:37 ` Junio C Hamano
2016-04-26 16:33 ` Torsten Bögershausen
2016-04-26 17:42 ` Junio C Hamano
2016-04-25 16:56 ` [PATCH v7 08/10] convert.c: more safer crlf handling with text attribute tboegi
2016-04-25 16:56 ` [PATCH v7 09/10] t6038; use crlf on all platforms tboegi
2016-04-25 16:56 ` [PATCH v7 10/10] ce_compare_data() did not respect conversion tboegi
2016-04-29 15:01 ` [PATCH v8 01/10] t0027: make commit_chk_wrnNNO() reliable tboegi
2016-04-29 15:01 ` [PATCH v8 02/10] convert: allow core.autocrlf=input and core.eol=crlf tboegi
2016-04-29 15:01 ` [PATCH v8 03/10] t0027: test cases for combined attributes tboegi
2016-04-29 15:01 ` [PATCH v8 04/10] convert.c: ident + core.autocrlf didn't work tboegi
2016-04-29 15:02 ` [PATCH v8 05/10] read-cache: factor out get_sha1_from_index() helper tboegi
2016-04-29 15:02 ` [PATCH v8 06/10] convert.c: stream and early out tboegi
2016-04-29 15:02 ` [PATCH v8 07/10] convert: unify the "auto" handling of CRLF tboegi
2016-11-25 15:48 ` Torsten Bögershausen
2016-11-27 16:22 ` [PATCH/RFC v1 1/1] New way to normalize the line endings tboegi
2016-11-29 19:15 ` Junio C Hamano
2017-04-12 11:48 ` [PATCH v2 1/1] Document how " tboegi
2016-04-29 15:02 ` [PATCH v8 08/10] convert.c: more safer crlf handling with text attribute tboegi
2016-04-29 15:02 ` [PATCH v8 09/10] t6038; use crlf on all platforms tboegi
2016-04-29 15:02 ` [PATCH v8 10/10] ce_compare_data() did not respect conversion tboegi
2016-04-29 18:20 ` Junio C Hamano
2016-04-29 21:09 ` Junio C Hamano
2016-05-01 16:27 ` Torsten Bögershausen
2016-05-02 18:16 ` Junio C Hamano
2016-05-02 19:33 ` Junio C Hamano
2016-05-03 16:02 ` Torsten Bögershausen
2016-05-03 18:31 ` Junio C Hamano
2016-05-04 4:07 ` Torsten Bögershausen
2016-05-04 7:23 ` Junio C Hamano
2016-05-06 8:54 ` Torsten Bögershausen
2016-05-06 17:11 ` Junio C Hamano
2016-05-07 6:10 ` [PATCH v9 0/6] convert-eol-autocrlf, old 5..10 now 1..6 tboegi
2016-05-07 6:10 ` [PATCH v9 1/6] read-cache: factor out get_sha1_from_index() helper tboegi
2016-05-09 19:54 ` Junio C Hamano
2016-05-07 6:11 ` tboegi [this message]
2016-05-09 20:29 ` [PATCH v9 2/6] convert.c: stream and early out Junio C Hamano
2016-05-11 4:30 ` Torsten Bögershausen
2016-05-07 6:11 ` [PATCH v9 3/6] convert: unify the "auto" handling of CRLF tboegi
2016-05-07 6:11 ` [PATCH v9 4/6] convert.c: more safer crlf handling with text attribute tboegi
2016-05-07 6:11 ` [PATCH v9 5/6] t6038; use crlf on all platforms tboegi
2016-05-07 6:11 ` [PATCH v9 6/6] convert: ce_compare_data() checks for a sha1 of a path tboegi
2016-02-11 16:16 ` [PATCH 2/3] Factor out convert_cmp_checkout() into convert.c tboegi
2016-02-11 16:16 ` [PATCH 3/3] convert.c: Optimize convert_cmp_checkout() for changed file len tboegi
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: http://vger.kernel.org/majordomo-info.html
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1462601460-23543-1-git-send-email-tboegi@web.de \
--to=tboegi@web.de \
--cc=git@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/mirrors/git.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).