git@vger.kernel.org mailing list mirror (one of many)
 help / color / mirror / code / Atom feed
From: "Torsten Bögershausen" <totte.enea@gmail.com>
To: Michael J Gruber <git@drmicha.warpmail.net>
Cc: "Ævar Arnfjörð Bjarmason" <avarab@gmail.com>,
	matthias.moeller@math.tu-dortmund.de, git@vger.kernel.org
Subject: Re: Git, Mac OS X and German special characters
Date: Thu, 20 May 2010 11:02:01 +0200	[thread overview]
Message-ID: <4BF4FA89.2040904@gmail.com> (raw)
In-Reply-To: <4BF4F7D7.60002@drmicha.warpmail.net>

Hej,
I have the same problem here.
Below there is a patch, which may solve the problem.
(Yes, the whitespace is broken. I'm still fighting with
git format-patch -s --cover-letter -M --stdout origin/master | git 
imap-send)
But this patch may be a starting point for improvements.
Comments welcome
BR
/Torsten



Improved interworking between Mac OS X and Linux when umlauts are used
When a git repository containing utf-8 coded umlaut characters
is cloned onto a Mac OS X machine, the Mac OS system will convert
all filenames returned by readdir() into denormalized (NFD) utf-8.
As a result of this conversion, git will not find them on disk.
This helps by treating the NFC and NFD versions of filenames as
identical on Mac OS.






Signed-off-by: Torsten Bögershausen <tboegi@web.de>
---
name-hash.c |   40 ++++++++++++++++++++++++++++++++++++++++
utf8.c      |   55 ++++++++++++++++++++++++++++++++++++++++++++++++-------
utf8.h      |   11 +++++++++++
3 files changed, 99 insertions(+), 7 deletions(-)

diff --git a/name-hash.c b/name-hash.c
index 0031d78..e6494e8 100644
--- a/name-hash.c
+++ b/name-hash.c
@@ -7,6 +7,7 @@
  */
#define NO_THE_INDEX_COMPATIBILITY_MACROS
#include "cache.h"
+#include "utf8.h"

/*
  * This removes bit 5 if bit 6 is set.
@@ -100,6 +101,25 @@ static int same_name(const struct cache_entry *ce, 
const char *name, int namelen
     return icase && slow_same_name(name, namelen, ce->name, len);
}

+#ifdef __APPLE__
+struct cache_entry *index_name_exists2(struct index_state *istate, 
const char *name, int icase)
+{
+    int namelen = (int)strlen(name);
+    unsigned int hash = hash_name(name, namelen);
+    struct cache_entry *ce;
+
+    ce = lookup_hash(hash, &istate->name_hash);
+    while (ce) {
+        if (!(ce->ce_flags & CE_UNHASHED)) {
+            if (same_name(ce, name, namelen, icase))
+                return ce;
+        }
+        ce = ce->next;
+    }
+    return NULL;
+}
+#endif
+
struct cache_entry *index_name_exists(struct index_state *istate, const 
char *name, int namelen, int icase)
{
     unsigned int hash = hash_name(name, namelen);
@@ -115,5 +135,25 @@ struct cache_entry *index_name_exists(struct 
index_state *istate, const char *na
         }
         ce = ce->next;
     }
+#ifdef __APPLE__
+    {
+        char *name_nfc_nfd;
+        name_nfc_nfd = str_nfc2nfd(name);
+        if (name_nfc_nfd) {
+            ce = index_name_exists2(istate, name_nfc_nfd, icase);
+            free(name_nfc_nfd);
+            if (ce)
+                return ce;
+        }
+        name_nfc_nfd = str_nfd2nfc(name);
+        if (name_nfc_nfd) {
+            ce = index_name_exists2(istate, name_nfc_nfd, icase);
+            free(name_nfc_nfd);
+            if (ce)
+                return ce;
+        }
+    }
+#endif
+
     return NULL;
}
diff --git a/utf8.c b/utf8.c
index 84cfc72..8e794dc 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2,6 +2,11 @@
#include "strbuf.h"
#include "utf8.h"

+#ifdef __APPLE__
+static iconv_t my_iconv_nfd2nfc = (iconv_t) -1;
+static iconv_t my_iconv_nfc2nfd = (iconv_t) -1;
+#endif
+
/* This code is originally from http://www.cl.cam.ac.uk/~mgk25/ucs/ */

struct interval {
@@ -424,18 +429,13 @@ int is_encoding_utf8(const char *name)
#else
     typedef char * iconv_ibp;
#endif
-char *reencode_string(const char *in, const char *out_encoding, const 
char *in_encoding)
+
+char *reencode_string_iconv(const char *in, iconv_t conv)
{
-    iconv_t conv;
     size_t insz, outsz, outalloc;
     char *out, *outpos;
     iconv_ibp cp;

-    if (!in_encoding)
-        return NULL;
-    conv = iconv_open(out_encoding, in_encoding);
-    if (conv == (iconv_t) -1)
-        return NULL;
     insz = strlen(in);
     outsz = insz;
     outalloc = outsz + 1; /* for terminating NUL */
@@ -469,7 +469,48 @@ char *reencode_string(const char *in, const char 
*out_encoding, const char *in_e
             break;
         }
     }
+    return out;
+}
+
+char *reencode_string(const char *in, const char *out_encoding, const 
char *in_encoding)
+{
+    iconv_t conv;
+    char *out;
+
+    if (!in_encoding)
+        return NULL;
+    conv = iconv_open(out_encoding, in_encoding);
+    if (conv == (iconv_t) -1)
+        return NULL;
+    out = reencode_string_iconv(in, conv);
     iconv_close(conv);
     return out;
}
+
+#ifdef __APPLE__
+char*
+str_nfc2nfd(const char *in)
+{
+    if (my_iconv_nfc2nfd == (iconv_t) -1) {
+        my_iconv_nfc2nfd = iconv_open("utf-8-mac", "utf-8");
+        if (my_iconv_nfc2nfd == (iconv_t) -1) {
+            return NULL;
+        }
+    }
+    return reencode_string_iconv(in, my_iconv_nfc2nfd);
+}
+
+char*
+str_nfd2nfc(const char *in)
+{
+    if (my_iconv_nfd2nfc == (iconv_t) -1){
+        my_iconv_nfd2nfc = iconv_open("utf-8", "utf-8-mac");
+        if (my_iconv_nfd2nfc == (iconv_t) -1) {
+            return NULL;
+        }
+    }
+    return reencode_string_iconv(in, my_iconv_nfd2nfc);
+}
+#endif /* APPLE */
+
#endif
diff --git a/utf8.h b/utf8.h
index ebc4d2f..db29c8a 100644
--- a/utf8.h
+++ b/utf8.h
@@ -13,8 +13,19 @@ int strbuf_add_wrapped_text(struct strbuf *buf,

#ifndef NO_ICONV
char *reencode_string(const char *in, const char *out_encoding, const 
char *in_encoding);
+char *reencode_string_iconv(const char *in, iconv_t conv);
+#ifdef __APPLE__
+char *str_nfc2nfd(const char *in);
+char *str_nfd2nfc(const char *in);
+#else
+#define str_nfc2nfd(in) (NULL)
+#define str_nfd2nfc(in) (NULL)
+#endif
#else
#define reencode_string(a,b,c) NULL
+#define reencode_string2(a,b) NULL
+#define str_nfc2nfd(in) (NULL)
+#define str_nfd2nfc(in) (NULL)
#endif

#endif
-- 
1.7.1.dirty










On 20.05.10 10:50, Michael J Gruber wrote:
> Ævar Arnfjörð Bjarmason venit, vidit, dixit 20.05.2010 10:34:
>    
>> On Thu, May 20, 2010 at 07:26, Matthias Moeller
>> <matthias.moeller@math.tu-dortmund.de>  wrote:
>>      
>>> I have been searching the web for help and found lengthy discussions
>>> which state that this is a common problem of the HFS+ filesystem.
>>> What I did not find was a solution to this problem. Is there a solution
>>> to this problem?
>>>        
>> Is this problem particular to Git, or do you also get it if you
>> e.g. rsync from the Linux box to the Mac OS X box?
>>
>>      
>>> #       "U\314\210bersicht.xls"
>>>        
>> You probably have to configure your shell on OSX to render UTF-8
>> correctly. It's just showing the raw escaped byte sequence instead of
>> a character there.
>>
>> There isn't anything wrong with OSX in this case, filename encoding on
>> any POSIX system is only done by convention. You'll find that you have
>> similar problems on Linux if you encode filename in Big5 or
>> UTF-32.
>>
>> Linux will happily accept it, but your shell / other applications will
>> render it as unknown goo because they expect UTF-8.
>>      
> No, the problem with git status is not the display. Matthias' problem is
> that git status reports a tracked file as untracked. The reason is that
> on HFS+, you create a file with name A and get a file with name B, where
> A and B are different representations of the same name. There seems to
> be no way to reliably detect which one HFS+ uses.
>
> Michael
> --
> To unsubscribe from this list: send the line "unsubscribe git" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>    

  parent reply	other threads:[~2010-05-20  9:02 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-05-20  7:26 Git, Mac OS X and German special characters Matthias Moeller
2010-05-20  8:34 ` Ævar Arnfjörð Bjarmason
2010-05-20  8:50   ` Michael J Gruber
2010-05-20  8:57     ` demerphq
2010-05-20  9:02     ` Torsten Bögershausen [this message]
2010-05-20  9:15       ` Michael J Gruber
     [not found]         ` <4BF5294E.7060206@web.de>
2010-05-20 14:29           ` Michael J Gruber
2010-05-20 15:30         ` Jay Soffian
2010-05-20 15:50       ` Jay Soffian
2010-05-20 18:22         ` Jay Soffian
2010-05-20  9:16     ` Matthias Moeller
2010-05-20 10:38     ` Thomas Singer
2010-05-20  8:55   ` demerphq
  -- strict thread matches above, loose matches on Subject: below --
2011-10-01 12:44 Albert Zeyer
2011-10-01 13:39 ` Andreas Ericsson
     [not found]   ` <CAO1Q+jeLEp2ReNc9eOFoJxdGq6oRE3b+O=JvMNU0Kqx_eAX=7w@mail.gmail.com>
2011-10-01 14:24     ` Andreas Ericsson
2011-10-01 19:47       ` Andreas Krey
2011-10-01 22:02         ` Michael Witten
2011-10-01 23:14           ` Jakub Narebski
2011-10-01 23:26             ` Michael Witten
2011-10-01 23:48           ` Albert Zeyer
2011-10-03 19:48 ` Torsten Bögershausen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: http://vger.kernel.org/majordomo-info.html

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4BF4FA89.2040904@gmail.com \
    --to=totte.enea@gmail.com \
    --cc=avarab@gmail.com \
    --cc=git@drmicha.warpmail.net \
    --cc=git@vger.kernel.org \
    --cc=matthias.moeller@math.tu-dortmund.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/mirrors/git.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).