user/dev discussion of public-inbox itself
 help / color / mirror / code / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 03/12] lei_mail_sync: account for non-unique cases
Date: Tue, 21 Sep 2021 07:41:50 +0000	[thread overview]
Message-ID: <20210921074159.20052-4-e@80x24.org> (raw)
In-Reply-To: <20210921074159.20052-1-e@80x24.org>

NNTP servers, IMAP servers, and various MUAs may recycle
"unique" identifiers due to software bugs or careless BOFHs.
Warn about them, but always be prepared to account for them.
---
 lib/PublicInbox/LeiImport.pm          |  8 ++++++--
 lib/PublicInbox/LeiImportKw.pm        | 12 ++++++++----
 lib/PublicInbox/LeiInspect.pm         | 14 +++++++-------
 lib/PublicInbox/LeiLcat.pm            |  7 ++-----
 lib/PublicInbox/LeiMailSync.pm        | 20 ++++++++++----------
 lib/PublicInbox/LeiRefreshMailSync.pm |  2 +-
 6 files changed, 34 insertions(+), 29 deletions(-)

diff --git a/lib/PublicInbox/LeiImport.pm b/lib/PublicInbox/LeiImport.pm
index 40530914..3c30db8d 100644
--- a/lib/PublicInbox/LeiImport.pm
+++ b/lib/PublicInbox/LeiImport.pm
@@ -37,8 +37,12 @@ sub pmdir_cb { # called via wq_io_do from LeiPmdir->each_mdir_fn
 	substr($folder, 0, 0) = 'maildir:'; # add prefix
 	my $lse = $self->{lse} //= $self->{lei}->{sto}->search;
 	my $lms = $self->{-lms_ro} //= $self->{lei}->lms; # may be 0 or undef
-	my $oidbin = $lms ? $lms->name_oidbin($folder, $bn) : undef;
-	my @docids = defined($oidbin) ? $lse->over->oidbin_exists($oidbin) : ();
+	my @oidbin = $lms ? $lms->name_oidbin($folder, $bn) : ();
+	@oidbin > 1 and $self->{lei}->err("W: $folder/*/$$bn not unique:\n",
+				map { "\t".unpack('H*', $_)."\n" } @oidbin);
+	my %seen;
+	my @docids = sort { $a <=> $b } grep { !$seen{$_}++ }
+			map { $lse->over->oidbin_exists($_) } @oidbin;
 	my $vmd = $self->{-import_kw} ? { kw => $kw } : undef;
 	if (scalar @docids) {
 		$lse->kw_changed(undef, $kw, \@docids) or return;
diff --git a/lib/PublicInbox/LeiImportKw.pm b/lib/PublicInbox/LeiImportKw.pm
index 379101c2..21c93515 100644
--- a/lib/PublicInbox/LeiImportKw.pm
+++ b/lib/PublicInbox/LeiImportKw.pm
@@ -34,11 +34,15 @@ sub ipc_atfork_child {
 
 sub ck_update_kw { # via wq_io_do
 	my ($self, $url, $uid, $kw) = @_;
-	my $oidbin = $self->{-lms_ro}->imap_oidbin($url, $uid) // return;
-	my @docids = $self->{over}->oidbin_exists($oidbin) or return;
+	my @oidbin = $self->{-lms_ro}->num_oidbin($url, $uid);
+	my $uid_url = "$url/;UID=$uid";
+	@oidbin > 1 and $self->{lei}->err("W: $uid_url not unique:\n",
+				map { "\t".unpack('H*', $_)."\n" } @oidbin);
+	my %seen;
+	my @docids = sort { $a <=> $b } grep { !$seen{$_}++ }
+		map { $self->{over}->oidbin_exists($_) } @oidbin;
 	$self->{lse}->kw_changed(undef, $kw, \@docids) or return;
-	$self->{verbose} and
-		$self->{lei}->qerr('# '.unpack('H*', $oidbin)." => @$kw\n");
+	$self->{verbose} and $self->{lei}->qerr("# $uid_url => @$kw\n");
 	$self->{sto}->wq_do('set_eml_vmd', undef, { kw => $kw }, \@docids);
 }
 
diff --git a/lib/PublicInbox/LeiInspect.pm b/lib/PublicInbox/LeiInspect.pm
index ab2c98d9..722ba5b2 100644
--- a/lib/PublicInbox/LeiInspect.pm
+++ b/lib/PublicInbox/LeiInspect.pm
@@ -32,11 +32,9 @@ sub inspect_imap_uid ($$) {
 	my ($lei, $uid_uri) = @_;
 	my $ent = {};
 	my $lms = $lei->lms or return $ent;
-	my $oidhex = $lms->imap_oid($lei, $uid_uri);
-	if (ref(my $err = $oidhex)) { # arg2folder error
-		$lei->qerr(@{$err->{qerr}}) if $err->{qerr};
-	}
-	$ent->{$$uid_uri} = $oidhex;
+	my @oidhex = $lms->imap_oidhex($lei, $uid_uri);
+	$ent->{$$uid_uri} = @oidhex == 1 ? $oidhex[0] :
+			((@oidhex == 0) ? undef : \@oidhex);
 	$ent;
 }
 
@@ -54,8 +52,10 @@ sub inspect_nntp_range {
 	}
 	$end //= $beg;
 	for my $art ($beg..$end) {
-		my $oidbin = $lms->imap_oidbin($folders->[0], $art);
-		$ent->{$art} = $oidbin ? unpack('H*', $oidbin) : undef;
+		my @oidhex = map { unpack('H*', $_) }
+			$lms->num_oidbin($folders->[0], $art);
+		$ent->{$art} = @oidhex == 1 ? $oidhex[0] :
+				((@oidhex == 0) ? undef : \@oidhex);
 	}
 	$ret;
 }
diff --git a/lib/PublicInbox/LeiLcat.pm b/lib/PublicInbox/LeiLcat.pm
index 1e54c3bf..8f8e83bc 100644
--- a/lib/PublicInbox/LeiLcat.pm
+++ b/lib/PublicInbox/LeiLcat.pm
@@ -32,11 +32,8 @@ sub lcat_imap_uri ($$) {
 	my $lms = $lei->lms or return;
 	# cf. LeiXsearch->lcat_dump
 	if (defined $uri->uid) {
-		my $oidhex = $lms->imap_oid($lei, $uri);
-		if (ref(my $err = $oidhex)) { # art2folder error
-			$lei->qerr(@{$err->{qerr}}) if $err->{qerr};
-		}
-		push @{$lei->{lcat_blob}}, $oidhex;
+		my @oidhex = $lms->imap_oidhex($lei, $uri);
+		push @{$lei->{lcat_blob}}, @oidhex;
 	} elsif (defined(my $fid = $lms->fid_for($$uri))) {
 		push @{$lei->{lcat_fid}}, $fid;
 	} else {
diff --git a/lib/PublicInbox/LeiMailSync.pm b/lib/PublicInbox/LeiMailSync.pm
index d9b9e117..3e725d30 100644
--- a/lib/PublicInbox/LeiMailSync.pm
+++ b/lib/PublicInbox/LeiMailSync.pm
@@ -66,6 +66,7 @@ CREATE TABLE IF NOT EXISTS blob2num (
 	oidbin VARBINARY NOT NULL,
 	fid INTEGER NOT NULL, /* folder ID */
 	uid INTEGER NOT NULL, /* NNTP article number, IMAP UID, MH number */
+	/* not UNIQUE(fid, uid), since we may have broken servers */
 	UNIQUE (oidbin, fid, uid)
 )
 
@@ -78,6 +79,7 @@ CREATE TABLE IF NOT EXISTS blob2name (
 	oidbin VARBINARY NOT NULL,
 	fid INTEGER NOT NULL, /* folder ID */
 	name VARBINARY NOT NULL, /* Maildir basename, JMAP blobId */
+	/* not UNIQUE(fid, name), since we may have broken software */
 	UNIQUE (oidbin, fid, name)
 )
 
@@ -522,14 +524,14 @@ EOM
 	}
 }
 
-sub imap_oidbin ($$$) {
-	my ($self, $url, $uid) = @_; # $url MUST have UIDVALIDITY
-	my $fid = $self->{fmap}->{$url} //= fid_for($self, $url) // return;
+sub num_oidbin ($$$) {
+	my ($self, $url, $uid) = @_; # $url MUST have UIDVALIDITY if IMAP
+	my $fid = $self->{fmap}->{$url} //= fid_for($self, $url) // return ();
 	my $sth = $self->{dbh}->prepare_cached(<<EOM, undef, 1);
-SELECT oidbin FROM blob2num WHERE fid = ? AND uid = ?
+SELECT oidbin FROM blob2num WHERE fid = ? AND uid = ? ORDER BY _rowid_
 EOM
 	$sth->execute($fid, $uid);
-	$sth->fetchrow_array;
+	map { $_->[0] } @{$sth->fetchall_arrayref};
 }
 
 sub name_oidbin ($$$) {
@@ -539,10 +541,10 @@ sub name_oidbin ($$$) {
 SELECT oidbin FROM blob2name WHERE fid = ? AND name = ?
 EOM
 	$sth->execute($fid, $nm);
-	$sth->fetchrow_array;
+	map { $_->[0] } @{$sth->fetchall_arrayref};
 }
 
-sub imap_oid {
+sub imap_oidhex {
 	my ($self, $lei, $uid_uri) = @_;
 	my $mailbox_uri = $uid_uri->clone;
 	$mailbox_uri->uid(undef);
@@ -550,12 +552,10 @@ sub imap_oid {
 	if (my $err = $self->arg2folder($lei, $folders)) {
 		if ($err->{fail}) {
 			$lei->qerr("# no sync information for $mailbox_uri");
-			return;
 		}
 		$lei->qerr(@{$err->{qerr}}) if $err->{qerr};
 	}
-	my $oidbin = imap_oidbin($self, $folders->[0], $uid_uri->uid);
-	$oidbin ? unpack('H*', $oidbin) : undef;
+	map { unpack('H*',$_) } num_oidbin($self, $folders->[0], $uid_uri->uid)
 }
 
 1;
diff --git a/lib/PublicInbox/LeiRefreshMailSync.pm b/lib/PublicInbox/LeiRefreshMailSync.pm
index 2f105005..92673492 100644
--- a/lib/PublicInbox/LeiRefreshMailSync.pm
+++ b/lib/PublicInbox/LeiRefreshMailSync.pm
@@ -36,7 +36,7 @@ sub pmdir_cb { # called via LeiPmdir->each_mdir_fn
 	my ($folder, $bn) = ($f =~ m!\A(.+?)/(?:new|cur)/([^/]+)\z!) or
 		die "BUG: $f was not from a Maildir?";
 	substr($folder, 0, 0) = 'maildir:'; # add prefix
-	return if defined($self->{lms}->name_oidbin($folder, $bn));
+	return if scalar($self->{lms}->name_oidbin($folder, $bn));
 	my $eml = eml_from_path($f) // return;
 	my $oidbin = $self->{lei}->git_oid($eml)->digest;
 	$self->{lms}->set_src($oidbin, $folder, \$bn);

  parent reply	other threads:[~2021-09-21  7:41 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-09-21  7:41 [PATCH 00/12] lei: fix various annoyances Eric Wong
2021-09-21  7:41 ` [PATCH 01/12] lei inspect: convert to WQ worker Eric Wong
2021-09-21  7:41 ` [PATCH 02/12] lei inspect: support NNTP URLs Eric Wong
2021-09-21  7:41 ` Eric Wong [this message]
2021-09-21  7:41 ` [PATCH 04/12] lei: simplify internal arg2folder usage Eric Wong
2021-09-21  7:41 ` [PATCH 05/12] lei lcat: use single queue for ordering Eric Wong
2021-09-21  7:41 ` [PATCH 06/12] doc: lei-security: section for WIP auth methods Eric Wong
2021-09-21  7:41 ` [PATCH 07/12] lei lcat: support NNTP URLs Eric Wong
2021-09-21  7:41 ` [PATCH 08/12] lei: various completion improvements Eric Wong
2021-09-21  7:41 ` [PATCH 09/12] lei q: show progress on >1s preparation phase Eric Wong
2021-09-21  7:41 ` [PATCH 10/12] search: drop reopen retry message Eric Wong
2021-09-21  7:41 ` [PATCH 11/12] lei q: update messages to reflect --save default Eric Wong
2021-09-21  7:41 ` [PATCH 12/12] lei q: improve --limit behavior and progress Eric Wong
2021-09-21  9:29 ` [PATCH 0/3] lei: a few more annoyances fixed Eric Wong
2021-09-21  9:29   ` [PATCH 1/3] t/lei-up: use '-q' to silence non-redirected test Eric Wong
2021-09-21  9:29   ` [PATCH 2/3] script/lei: handle SIGTSTP and SIGCONT Eric Wong
2021-09-21  9:29   ` [PATCH 3/3] lei: umask(077) before opening errors.log Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210921074159.20052-4-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://80x24.org/public-inbox.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).