From: "Torsten Bögershausen" <totte.enea@gmail.com>
To: Michael J Gruber <git@drmicha.warpmail.net>
Cc: "Ævar Arnfjörð Bjarmason" <avarab@gmail.com>,
matthias.moeller@math.tu-dortmund.de, git@vger.kernel.org
Subject: Re: Git, Mac OS X and German special characters
Date: Thu, 20 May 2010 11:02:01 +0200 [thread overview]
Message-ID: <4BF4FA89.2040904@gmail.com> (raw)
In-Reply-To: <4BF4F7D7.60002@drmicha.warpmail.net>
Hej,
I have the same problem here.
Below is a patch that may solve the problem.
(Yes, the whitespace is broken. I'm still fighting with
git format-patch -s --cover-letter -M --stdout origin/master | git
imap-send)
But this patch may be a starting point for improvements.
Comments welcome
BR
/Torsten
Improve interoperability between Mac OS X and Linux when umlauts are used
When a git repository containing utf-8 coded umlaut characters
is cloned onto a Mac OS X machine, the Mac OS system will convert
all filenames returned by readdir() into decomposed (NFD) utf-8.
As a result of this conversion, git will not find them on disk.
This helps by treating the NFC and NFD versions of filenames as
identical on Mac OS.
Signed-off-by: Torsten Bögershausen <tboegi@web.de>
---
name-hash.c | 40 ++++++++++++++++++++++++++++++++++++++++
utf8.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++-------
utf8.h | 11 +++++++++++
3 files changed, 99 insertions(+), 7 deletions(-)
diff --git a/name-hash.c b/name-hash.c
index 0031d78..e6494e8 100644
--- a/name-hash.c
+++ b/name-hash.c
@@ -7,6 +7,7 @@
*/
#define NO_THE_INDEX_COMPATIBILITY_MACROS
#include "cache.h"
+#include "utf8.h"
/*
* This removes bit 5 if bit 6 is set.
@@ -100,6 +101,25 @@ static int same_name(const struct cache_entry *ce,
const char *name, int namelen
return icase && slow_same_name(name, namelen, ce->name, len);
}
+#ifdef __APPLE__
+struct cache_entry *index_name_exists2(struct index_state *istate,
const char *name, int icase)
+{
+ int namelen = (int)strlen(name);
+ unsigned int hash = hash_name(name, namelen);
+ struct cache_entry *ce;
+
+ ce = lookup_hash(hash, &istate->name_hash);
+ while (ce) {
+ if (!(ce->ce_flags & CE_UNHASHED)) {
+ if (same_name(ce, name, namelen, icase))
+ return ce;
+ }
+ ce = ce->next;
+ }
+ return NULL;
+}
+#endif
+
struct cache_entry *index_name_exists(struct index_state *istate, const
char *name, int namelen, int icase)
{
unsigned int hash = hash_name(name, namelen);
@@ -115,5 +135,25 @@ struct cache_entry *index_name_exists(struct
index_state *istate, const char *na
}
ce = ce->next;
}
+#ifdef __APPLE__
+ {
+ char *name_nfc_nfd;
+ name_nfc_nfd = str_nfc2nfd(name);
+ if (name_nfc_nfd) {
+ ce = index_name_exists2(istate, name_nfc_nfd, icase);
+ free(name_nfc_nfd);
+ if (ce)
+ return ce;
+ }
+ name_nfc_nfd = str_nfd2nfc(name);
+ if (name_nfc_nfd) {
+ ce = index_name_exists2(istate, name_nfc_nfd, icase);
+ free(name_nfc_nfd);
+ if (ce)
+ return ce;
+ }
+ }
+#endif
+
return NULL;
}
diff --git a/utf8.c b/utf8.c
index 84cfc72..8e794dc 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2,6 +2,11 @@
#include "strbuf.h"
#include "utf8.h"
+#ifdef __APPLE__
+static iconv_t my_iconv_nfd2nfc = (iconv_t) -1;
+static iconv_t my_iconv_nfc2nfd = (iconv_t) -1;
+#endif
+
/* This code is originally from http://www.cl.cam.ac.uk/~mgk25/ucs/ */
struct interval {
@@ -424,18 +429,13 @@ int is_encoding_utf8(const char *name)
#else
typedef char * iconv_ibp;
#endif
-char *reencode_string(const char *in, const char *out_encoding, const
char *in_encoding)
+
+char *reencode_string_iconv(const char *in, iconv_t conv)
{
- iconv_t conv;
size_t insz, outsz, outalloc;
char *out, *outpos;
iconv_ibp cp;
- if (!in_encoding)
- return NULL;
- conv = iconv_open(out_encoding, in_encoding);
- if (conv == (iconv_t) -1)
- return NULL;
insz = strlen(in);
outsz = insz;
outalloc = outsz + 1; /* for terminating NUL */
@@ -469,7 +469,48 @@ char *reencode_string(const char *in, const char
*out_encoding, const char *in_e
break;
}
}
+ return out;
+}
+
+char *reencode_string(const char *in, const char *out_encoding, const
char *in_encoding)
+{
+ iconv_t conv;
+ char *out;
+
+ if (!in_encoding)
+ return NULL;
+ conv = iconv_open(out_encoding, in_encoding);
+ if (conv == (iconv_t) -1)
+ return NULL;
+ out = reencode_string_iconv(in, conv);
iconv_close(conv);
return out;
}
+
+#ifdef __APPLE__
+char*
+str_nfc2nfd(const char *in)
+{
+ if (my_iconv_nfc2nfd == (iconv_t) -1) {
+ my_iconv_nfc2nfd = iconv_open("utf-8-mac", "utf-8");
+ if (my_iconv_nfc2nfd == (iconv_t) -1) {
+ return NULL;
+ }
+ }
+ return reencode_string_iconv(in, my_iconv_nfc2nfd);
+}
+
+char*
+str_nfd2nfc(const char *in)
+{
+ if (my_iconv_nfd2nfc == (iconv_t) -1){
+ my_iconv_nfd2nfc = iconv_open("utf-8", "utf-8-mac");
+ if (my_iconv_nfd2nfc == (iconv_t) -1) {
+ return NULL;
+ }
+ }
+ return reencode_string_iconv(in, my_iconv_nfd2nfc);
+}
+#endif /* APPLE */
+
#endif
diff --git a/utf8.h b/utf8.h
index ebc4d2f..db29c8a 100644
--- a/utf8.h
+++ b/utf8.h
@@ -13,8 +13,19 @@ int strbuf_add_wrapped_text(struct strbuf *buf,
#ifndef NO_ICONV
char *reencode_string(const char *in, const char *out_encoding, const
char *in_encoding);
+char *reencode_string_iconv(const char *in, iconv_t conv);
+#ifdef __APPLE__
+char *str_nfc2nfd(const char *in);
+char *str_nfd2nfc(const char *in);
+#else
+#define str_nfc2nfd(in) (NULL)
+#define str_nfd2nfc(in) (NULL)
+#endif
#else
#define reencode_string(a,b,c) NULL
+#define reencode_string2(a,b) NULL
+#define str_nfc2nfd(in) (NULL)
+#define str_nfd2nfc(in) (NULL)
#endif
#endif
--
1.7.1.dirty
On 20.05.10 10:50, Michael J Gruber wrote:
> Ævar Arnfjörð Bjarmason venit, vidit, dixit 20.05.2010 10:34:
>
>> On Thu, May 20, 2010 at 07:26, Matthias Moeller
>> <matthias.moeller@math.tu-dortmund.de> wrote:
>>
>>> I have been searching the web for help and found lengthy discussions
>>> which state that this is a common problem of the HFS+ filesystem.
>>> What I did not find was a solution to this problem. Is there a solution
>>> to this problem?
>>>
>> Is this problem particular to Git, or do you also get it if you
>> e.g. rsync from the Linux box to the Mac OS X box?
>>
>>
>>> # "U\314\210bersicht.xls"
>>>
>> You probably have to configure your shell on OSX to render UTF-8
>> correctly. It's just showing the raw escaped byte sequence instead of
>> a character there.
>>
>> There isn't anything wrong with OSX in this case, filename encoding on
>> any POSIX system is only done by convention. You'll find that you have
>> similar problems on Linux if you encode filename in Big5 or
>> UTF-32.
>>
>> Linux will happily accept it, but your shell / other applications will
>> render it as unknown goo because they expect UTF-8.
>>
> No, the problem with git status is not the display. Matthias' problem is
> that git status reports a tracked file as untracked. The reason is that
> on HFS+, you create a file with name A and get a file with name B, where
> A and B are different representations of the same name. There seems to
> be no way to reliably detect which one HFS+ uses.
>
> Michael
> --
> To unsubscribe from this list: send the line "unsubscribe git" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
next prev parent reply other threads:[~2010-05-20 9:02 UTC|newest]
Thread overview: 22+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-05-20 7:26 Git, Mac OS X and German special characters Matthias Moeller
2010-05-20 8:34 ` Ævar Arnfjörð Bjarmason
2010-05-20 8:50 ` Michael J Gruber
2010-05-20 8:57 ` demerphq
2010-05-20 9:02 ` Torsten Bögershausen [this message]
2010-05-20 9:15 ` Michael J Gruber
[not found] ` <4BF5294E.7060206@web.de>
2010-05-20 14:29 ` Michael J Gruber
2010-05-20 15:30 ` Jay Soffian
2010-05-20 15:50 ` Jay Soffian
2010-05-20 18:22 ` Jay Soffian
2010-05-20 9:16 ` Matthias Moeller
2010-05-20 10:38 ` Thomas Singer
2010-05-20 8:55 ` demerphq
-- strict thread matches above, loose matches on Subject: below --
2011-10-01 12:44 Albert Zeyer
2011-10-01 13:39 ` Andreas Ericsson
[not found] ` <CAO1Q+jeLEp2ReNc9eOFoJxdGq6oRE3b+O=JvMNU0Kqx_eAX=7w@mail.gmail.com>
2011-10-01 14:24 ` Andreas Ericsson
2011-10-01 19:47 ` Andreas Krey
2011-10-01 22:02 ` Michael Witten
2011-10-01 23:14 ` Jakub Narebski
2011-10-01 23:26 ` Michael Witten
2011-10-01 23:48 ` Albert Zeyer
2011-10-03 19:48 ` Torsten Bögershausen
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: http://vger.kernel.org/majordomo-info.html
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4BF4FA89.2040904@gmail.com \
--to=totte.enea@gmail.com \
--cc=avarab@gmail.com \
--cc=git@drmicha.warpmail.net \
--cc=git@vger.kernel.org \
--cc=matthias.moeller@math.tu-dortmund.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://80x24.org/mirrors/git.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).