* [PATCH/RFC] file import functionality for git-remote-mw
From: Pavel Volek @ 2012-06-04 19:15 UTC
  To: git
  Cc: Volek Pavel, Pavel Volek, NGUYEN Kim Thuat,
	ROUCHER IGLESIAS Javier, Matthieu Moy

From: Volek Pavel <me@pavelvolek.cz>

The current version of git-remote-mediawiki supports only import and export of wiki pages; it does not handle the file attachments that are also exposed by the MediaWiki API. This patch adds the ability to import the latest version of each file, together with all revisions of the corresponding description pages.
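
For example, with this patch a clone such as

  git clone mediawiki::http://example.com/wiki

(hypothetical URL) also checks out the latest version of each attached
file next to the "<title>.mw" page files, and imports the file
description pages with their full history like any other page.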

Signed-off-by: Pavel Volek <Pavel.Volek@ensimag.imag.fr>
Signed-off-by: NGUYEN Kim Thuat <Kim-Thuat.Nguyen@ensimag.imag.fr>
Signed-off-by: ROUCHER IGLESIAS Javier <roucherj@ensimag.imag.fr>
Signed-off-by: Matthieu Moy <Matthieu.Moy@imag.fr>
---
 contrib/mw-to-git/git-remote-mediawiki | 128 +++++++++++++++++++++++++++++++--
 1 file changed, 123 insertions(+), 5 deletions(-)

diff --git a/contrib/mw-to-git/git-remote-mediawiki b/contrib/mw-to-git/git-remote-mediawiki
index c18bfa1..4168218 100755
--- a/contrib/mw-to-git/git-remote-mediawiki
+++ b/contrib/mw-to-git/git-remote-mediawiki
@@ -267,6 +267,39 @@ sub get_mw_pages {
 	return values(%pages);
 }
 
+sub get_mw_media_pages {
+	mw_connect_maybe();
+
+	my %pages; # hash on page titles to avoid duplicates
+
+	# get all pages for mediafiles (they are in a different namespace)
+	# only one namespace can be queried at a time
+	my $mw_pages = $mediawiki->list({
+		action => 'query',
+		list => 'allpages',
+		apnamespace => get_mw_namespace_id("File"),
+		aplimit => 500,
+	});
+	if (!defined($mw_pages)) {
+		print STDERR "fatal: could not get the list of media file pages.\n";
+		print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
+		print STDERR "fatal: make sure '$url/api.php' is a valid page.\n";
+		exit 1;
+	}
+	foreach my $page (@{$mw_pages}) {
+		$pages{$page->{title}} = $page;
+	}
+	return values(%pages);
+}
+
+sub get_all_mw_pages() {
+	my @pages = get_mw_pages();
+	my @media_pages = get_mw_media_pages();
+	push(@pages,@media_pages);
+
+	return @pages;
+}
+
 sub run_git {
 	open(my $git, "-|:encoding(UTF-8)", "git " . $_[0]);
 	my $res = do { local $/; <$git> };
@@ -300,7 +333,7 @@ my %basetimestamps;
 sub get_last_remote_revision {
 	mw_connect_maybe();
 
-	my @pages = get_mw_pages();
+	my @pages = get_all_mw_pages();
 
 	my $max_rev_num = 0;
 
@@ -403,6 +436,25 @@ sub mw_option {
 	print STDOUT "unsupported\n";
 }
 
+# Returns MediaWiki id for a canonical namespace name. Ex.: "File", "Project".
+sub get_mw_namespace_id {
+	mw_connect_maybe();
+	my $name = shift;
+	my $query = {
+		action => 'query',
+		meta => 'siteinfo',
+		siprop => 'namespaces',
+	};
+	my $result = $mediawiki->api($query);
+
+	while (my ($id, $ns) = each(%{$result->{query}->{namespaces}})) {
+		if (defined($ns->{canonical}) && ($ns->{canonical} eq $name)) {
+			return $ns->{id};
+		}
+	}
+	die "Namespace $name was not found on MediaWiki.";
+}
+
 sub fetch_mw_revisions_for_page {
 	my $page = shift;
 	my $id = shift;
@@ -461,11 +513,37 @@ sub fetch_mw_revisions {
 	return ($n, @revisions);
 }
 
+sub download_mw_mediafile {
+	my $filename = shift;
+
+	$mediawiki->{config}->{files_url} = $url;
+
+	# download() returns undef on error and "" for a file that does
+	# not exist on the wiki, so "|| die" must not be used here: the
+	# empty string is false and would wrongly trigger the die.
+	my $file = $mediawiki->download( { title => $filename } );
+	if (!defined($file)){
+		print STDERR "\tFile \'$filename\' could not be downloaded.\n";
+		exit 1;
+	} elsif ($file eq "") {
+		print STDERR "\tFile \'$filename\' does not exist on the wiki.\n";
+		exit 1;
+	} else {
+		return $file;
+	}
+}
+
 sub import_file_revision {
 	my $commit = shift;
 	my %commit = %{$commit};
 	my $full_import = shift;
 	my $n = shift;
+	my $mediafile_import = shift;
+	my $mediafile; my %mediafile;
+	if ($mediafile_import) {
+		$mediafile = shift;
+		%mediafile = %{$mediafile};
+	}
 
 	my $title = $commit{title};
 	my $comment = $commit{comment};
@@ -485,6 +563,10 @@ sub import_file_revision {
 	if ($content ne DELETED_CONTENT) {
 		print STDOUT "M 644 inline $title.mw\n";
 		literal_data($content);
+		if ($mediafile_import) {
+			print STDOUT "M 644 inline $mediafile{title}\n";
+			literal_data($mediafile{content});
+		}
 		print STDOUT "\n\n";
 	} else {
 		print STDOUT "D $title.mw\n";
@@ -547,7 +629,7 @@ sub mw_import_ref {
 
 	mw_connect_maybe();
 
-	my @pages = get_mw_pages();
+	my @pages = get_all_mw_pages();
 
 	print STDERR "Searching revisions...\n";
 	my $last_local = get_last_local_revision();
@@ -580,6 +662,7 @@ sub mw_import_ref {
 
 		$n++;
 
+		my $page_title = $result->{query}->{pages}->{$pagerevid->{pageid}}->{title};
 		my %commit;
 		$commit{author} = $rev->{user} || 'Anonymous';
 		$commit{comment} = $rev->{comment} || '*Empty MediaWiki Message*';
@@ -596,9 +679,44 @@ sub mw_import_ref {
 		}
 		$commit{date} = DateTime::Format::ISO8601->parse_datetime($last_timestamp);
 
-		print STDERR "$n/", scalar(@revisions), ": Revision #$pagerevid->{revid} of $commit{title}\n";
-
-		import_file_revision(\%commit, ($fetch_from == 1), $n);
+		# distinguish regular pages from media pages
+		my @prefix = split (":",$page_title);
+		if ($prefix[0] eq "File" || $prefix[0] eq "Image") {
+			# check if there is a corresponding mediafile with the same timestamp => it is a page
+			# for a new version of the file (not only for a new version of the file description)
+			# => download corresponding file version
+			$query = {
+				action => 'query',
+				prop => 'imageinfo',
+				titles => $page_title,
+				iistart => $rev->{timestamp},
+				iiend => $rev->{timestamp},
+				iiprop => 'timestamp|archivename',
+				iilimit => 1,
+			};
+			$result = $mediawiki->api($query);
+
+			my ($imageid,$imageinfo) = each ( %{$result->{query}->{pages}} );
+			# page has a related version of the file
+			if (defined($imageinfo->{imageinfo})) {
+				foreach ( @{$imageinfo->{imageinfo}} ) {
+					my %mediafile;
+					if ($prefix[0] eq "File") { $mediafile{title} = substr $page_title, 5; }
+					else { $mediafile{title} = substr $page_title, 6; }
+
+					$mediafile{content} = download_mw_mediafile("File:".$mediafile{title});
+					print STDERR "$n/", scalar(@revisions), ": Revision #$pagerevid->{revid} of $commit{title}\n";
+					import_file_revision(\%commit, ($fetch_from == 1), $n, 1, \%mediafile);
+				}
+			# page has no related version of the file, do commit only for the page
+			} else {
+				print STDERR "$n/", scalar(@revisions), ": Revision #$pagerevid->{revid} of $commit{title}\n";
+				import_file_revision(\%commit, ($fetch_from == 1), $n, 0);
+			}
+		} else {
+			print STDERR "$n/", scalar(@revisions), ": Revision #$pagerevid->{revid} of $commit{title}\n";
+			import_file_revision(\%commit, ($fetch_from == 1), $n, 0);
+		}
 	}
 
 	if ($fetch_from == 1 && $n == 0) {
-- 
1.7.10.2.552.gaa3bb87


* Re: [PATCH/RFC] file import functionality for git-remote-mw
From: Matthieu Moy @ 2012-06-04 20:54 UTC
  To: Pavel Volek; +Cc: git, Volek Pavel, NGUYEN Kim Thuat, ROUCHER IGLESIAS Javier

Pavel Volek <Pavel.Volek@ensimag.imag.fr> writes:

> +sub get_mw_media_pages {
> +	mw_connect_maybe();
> +
> +	my %pages; # hash on page titles to avoid duplicates
> +
> +	# get all pages for mediafiles (they are in a different namespace)
> +	# only one namespace can be queried at a time
> +	my $mw_pages = $mediawiki->list({
> +		action => 'query',
> +		list => 'allpages',
> +		apnamespace => get_mw_namespace_id("File"),
> +		aplimit => 500,
> +	});

This seems to be done unconditionally. Is this reasonable if the user
has explicitly set remote.origin.pages or remote.origin.categories?
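
(i.e. when the user has run something like

  git config remote.origin.pages "Main_Page Another_Page"

to track only a few pages)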

Actually, shouldn't this be added to get_mw_pages, next to the code
dealing with these two variables? Perhaps the function should be
split into multiple functions, along the lines of:

sub get_mw_pages {
	mw_connect_maybe();

	my %pages;
	my $user_defined;
	if (@tracked_pages) {
		$user_defined = 1;
		get_mw_tracked_pages(\%pages);
	}
	if (@tracked_categories) {
		$user_defined = 1;
		get_mw_tracked_categories(\%pages);
	}
	if (!$user_defined) {
		get_mw_all_pages(\%pages);
	}
	return values(%pages);
}
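
One of these helpers is basically the loop you already have; untested
sketch, just to illustrate the shape:

sub get_mw_all_pages {
	my $pages = shift;
	# all pages of the wiki's main namespace
	my $mw_pages = $mediawiki->list({
		action => 'query',
		list => 'allpages',
		aplimit => 500,
	});
	if (!defined($mw_pages)) {
		print STDERR "fatal: could not get the list of wiki pages.\n";
		exit 1;
	}
	foreach my $page (@{$mw_pages}) {
		$pages->{$page->{title}} = $page;
	}
}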

And your code would need to take these 3 options into account.

> +sub get_all_mw_pages() {
> +	my @pages = get_mw_pages();
> +	my @media_pages = get_mw_media_pages();
> +	push(@pages,@media_pages);

Space after comma.

> +# Returns MediaWiki id for a canonical namespace name. Ex.: "File", "Project".
> +sub get_mw_namespace_id {
> +	mw_connect_maybe();
> +	my $name = shift;
> +	my $query = {
> +		action => 'query',
> +		meta => 'siteinfo',
> +		siprop => 'namespaces',
> +	};
> +	my $result = $mediawiki->api($query);

It may make sense to cache the result, to avoid querying the API
multiple times if you call the function more than once. We can even
cache this in a configuration variable as the namespace identifiers are
unlikely to change for a given wiki.
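
For the in-memory part, something along these lines would do (untested
sketch, assuming a new file-scope %namespace_id hash):

my %namespace_id; # cache: canonical namespace name => id

sub get_mw_namespace_id {
	mw_connect_maybe();
	my $name = shift;
	if (!exists($namespace_id{$name})) {
		# only query the API on a cache miss
		my $result = $mediawiki->api({
			action => 'query',
			meta => 'siteinfo',
			siprop => 'namespaces',
		});
		while (my ($id, $ns) = each(%{$result->{query}->{namespaces}})) {
			if (defined($ns->{canonical})) {
				$namespace_id{$ns->{canonical}} = $ns->{id};
			}
		}
	}
	die "Namespace $name was not found on MediaWiki."
		unless exists($namespace_id{$name});
	return $namespace_id{$name};
}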

> +	if (!defined($file)){

Space between ) and { please.

> +		my @prefix = split (":",$page_title);

Space after , please.

> +		if ($prefix[0] eq "File" || $prefix[0] eq "Image") {
> +			# check if there is a corresponding mediafile with the same timestamp => it is a page
> +			# for a new version of the file (not only for a new version of the file description)

> +			# => download corresponding file version

Don't make long lines like this. In general, we avoid lines longer
than 80 characters (or even a bit less); these are over 100, and the
following ones are worse.

Long lines are usually an indication that you did not structure your
code into functions, and this diagnosis seems to apply here.
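
For instance, the whole imageinfo query and mediafile construction
could move into a helper, along these lines (untested sketch):

sub get_mw_mediafile_for_page_revision {
	# Return a mediafile hash for the file version attached to
	# page $page_title at $timestamp, or nothing if there is none.
	my ($page_title, $timestamp) = @_;
	my $result = $mediawiki->api({
		action => 'query',
		prop => 'imageinfo',
		titles => $page_title,
		iistart => $timestamp,
		iiend => $timestamp,
		iiprop => 'timestamp|archivename',
		iilimit => 1,
	});
	my ($id, $page) = each(%{$result->{query}->{pages}});
	return unless defined($page->{imageinfo});

	my %mediafile;
	($mediafile{title} = $page_title) =~ s/^(File|Image)://;
	$mediafile{content} =
		download_mw_mediafile("File:" . $mediafile{title});
	return \%mediafile;
}

The import loop then shrinks to a call to this helper plus the
import_file_revision() call.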

> +			my ($imageid,$imageinfo) = each ( %{$result->{query}->{pages}} );

Space after ",".

-- 
Matthieu Moy
http://www-verimag.imag.fr/~moy/
