From 58a5bb3e18901237b1ca34ef8f03f696be27d305 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Thu, 11 Aug 2016 00:23:48 +0000 Subject: search: support alt-ID for mapping legacy serial numbers For some existing mailing list archives, messages are identified by serial number (such as NNTP article numbers in gmane). Those links may become inaccessible (as is the current case for gmane), so ensure users can still search based on old serial numbers. Now, I run the following periodically to get article numbers from gmane (while news.gmane.org remains): NNTPSERVER=news.gmane.org export NNTPSERVER GROUP=gmane.comp.version-control.git perl -I lib scripts/xhdr-num2mid $GROUP --msgmap=/path/to/gmane.sqlite3 (I might integrate this further with public-inbox-* scripts one day). My ~/.public-inbox/config has an added "altid" snippet which now looks like this: [publicinbox "git"] address = git@vger.kernel.org mainrepo = /path/to/git.vger.git newsgroup = inbox.comp.version-control.git ; relative pathnames expand to $mainrepo/public-inbox/$file altid = serial:gmane:file=gmane.sqlite3 And run "public-inbox-index --reindex /path/to/git.vger.git" periodically. This ought to allow searching for "gmane:12345" to work for Xapian-enabled instances. Disclaimer: while public-inbox supports NNTP and stable article serial numbers, use of those for public links is discouraged since it encourages centralization. 
--- scripts/xhdr-num2mid | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) (limited to 'scripts') diff --git a/scripts/xhdr-num2mid b/scripts/xhdr-num2mid index f1e7ea34..bc3ede60 100755 --- a/scripts/xhdr-num2mid +++ b/scripts/xhdr-num2mid @@ -5,8 +5,18 @@ use strict; use warnings; use Net::NNTP; -use Data::Dumper; +use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev); my $usage = "usage: NNTPSERVER=news.example.org $0 GROUP [FIRST_NUM]\n"; +my ($msgmap, $mm); +my %opts = ( '--msgmap=s' => \$msgmap ); +GetOptions(%opts) or die "bad command-line args\n$usage"; + +if ($msgmap) { + require PublicInbox::Msgmap; + require PublicInbox::MID; # mid_clean + $mm = PublicInbox::Msgmap->new_file($msgmap, 1); +} + my $group = shift or die $usage; my $nntp = Net::NNTP->new($ENV{NNTPSERVER} || '127.0.0.1'); my ($num, $first, $last) = $nntp->group($group); @@ -15,16 +25,29 @@ my $arg_first = shift; if (defined $arg_first) { $arg_first =~ /\A\d+\z/ or die $usage; $first = $arg_first; +} elsif ($mm) { + my $last_article = $mm->meta_accessor('last_article'); + $first = $last_article + 1 if defined $last_article; } my $batch = 1000; my $i; for ($i = $first; $i < $last; $i += $batch) { - my $j = $i + $batch; + my $j = $i + $batch - 1; $j = $last if $j > $last; my $num2mid = $nntp->xhdr('Message-ID', "$i-$j"); + + $mm->{dbh}->begin_work if $mm; for my $n ($i..$j) { defined(my $mid = $num2mid->{$n}) or next; print "$n $mid\n"; + if ($mm) { + $mid = PublicInbox::MID::mid_clean($mid); + $mm->mid_set($n, $mid); + } + } + if ($mm) { + $mm->meta_accessor('last_article', $j); + $mm->{dbh}->commit; } } -- cgit v1.2.3-24-ge0c7