paul@0 | 1 | #!/usr/bin/env python |
paul@0 | 2 | |
paul@8 | 3 | """ |
paul@8 | 4 | Confluence XML dump conversion to a MoinMoin-compatible representation. |
paul@8 | 5 | |
paul@144 | 6 | Copyright (C) 2012, 2013, 2017 Paul Boddie <paul@boddie.org.uk> |
paul@8 | 7 | |
paul@8 | 8 | This software is free software; you can redistribute it and/or |
paul@8 | 9 | modify it under the terms of the GNU General Public License as |
paul@8 | 10 | published by the Free Software Foundation; either version 2 of |
paul@8 | 11 | the License, or (at your option) any later version. |
paul@8 | 12 | |
paul@8 | 13 | This software is distributed in the hope that it will be useful, |
paul@8 | 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
paul@8 | 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
paul@8 | 16 | GNU General Public License for more details. |
paul@8 | 17 | |
paul@8 | 18 | You should have received a copy of the GNU General Public |
paul@8 | 19 | License along with this library; see the file LICENCE.txt |
paul@8 | 20 | If not, write to the Free Software Foundation, Inc., |
paul@8 | 21 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA |
paul@8 | 22 | """ |
paul@8 | 23 | |
paul@40 | 24 | from os import chdir, getcwd, listdir, mkdir, makedirs, walk |
paul@40 | 25 | from os.path import exists, extsep, join, split, splitext |
paul@0 | 26 | from zipfile import ZipFile |
paul@0 | 27 | from cStringIO import StringIO |
paul@40 | 28 | from MoinMoin import wikiutil |
paul@0 | 29 | import codecs |
paul@0 | 30 | import xmlread |
paul@35 | 31 | import wikiparser, xmlparser |
paul@25 | 32 | import sys |
paul@123 | 33 | import time, calendar |
paul@0 | 34 | |
paul@84 | 35 | from common import get_page_title |
paul@23 | 36 | |
def date_to_seconds(s):

    """
    Convert the date string 's', having the form "YYYY-MM-DD HH:MM:SS" with an
    optional fractional part following a full stop, to an integer number of
    seconds since the epoch. The fields are interpreted by calendar.timegm and
    any fractional part is discarded.
    """

    whole_seconds = s.split(".", 1)[0]
    fields = time.strptime(whole_seconds, "%Y-%m-%d %H:%M:%S")
    return calendar.timegm(fields)
paul@123 | 39 | |
class ConfluenceHandler:

    """
    Handle content from a Confluence Wiki dump.

    The handle_* methods are callbacks for the "object", "property", "id",
    "collection" and "element" elements of the dump. Properties and collections
    are accumulated in the 'content' dictionary until the enclosing object
    element has been completed, at which point handle_object records the
    details below the directory named by 'space' and starts a new dictionary.
    """

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the given 'space', this being both the
        output directory and the prefix used in page titles. If 'no_translate'
        is set to a true value, page bodies are stored without translation.
        """

        self.content = {}
        self.elements = []
        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type. Objects appear as follows:

        <object class="Page" package="...">
          <id name="id">...</id>
          ...
        </object>

        Within objects, one finds things like properties and collections, which
        are handled by their own methods but which are stored in the content
        dictionary associated with the current object.

        By the time this method is called, the contents of the object will have
        been gathered and the properties and collections populated in the
        content dictionary. Any identifier will have been assigned to the
        textual content of the object element and will be available in the
        'text' parameter.

        Objects of other classes are ignored, but the content dictionary is
        reset in all cases.
        """

        objecttype = attributes[-1]["class"]

        # Any identifier is stored as the object's textual content.

        identifier = text.strip()

        # The content is a dictionary mapping names to properties and
        # collections.

        content = self.content

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):
            self._handle_page(objecttype, identifier, content)
        elif objecttype == "BodyContent":
            self._handle_body(content)
        elif objecttype == "Attachment":
            self._handle_attachment(identifier, content)

        self.content = {}

    def _handle_page(self, objecttype, identifier, content):

        "Record a page, comment or blog post revision in the page manifest."

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle pages and revisions, adding revisions to the page manifest.
        # The original version is used as a unifying identifier for all the
        # different revisions (each of which being defined by a Page
        # element). Although "original" implies the first identifier used,
        # it actually appears to be the latest and will have the highest
        # version number.

        pageid = content.get("originalVersion", identifier)
        versionfile = join(versions_dir, identifier)

        # Note page metadata, not necessarily in the correct order.
        # For comments, the title will need to be rewritten, since they
        # should be defined in terms of their owner page.

        # NOTE: This only makes the current title available to comments.

        mkdirs(join(pages_dir, pageid))

        # Limit the title to a "safe" number of characters in order to avoid
        # filesystem issues.

        title = get_page_title(content["title"])

        if title:
            title = "%s/%s" % (self.space, title)
            write(join(pages_dir, pageid, "pagetitle"), title)

        # Note the type of the page.

        write(join(pages_dir, pageid, "pagetype"), objecttype)

        # See sort_manifest for access to this data.
        # NOTE(review): values are written unescaped; a "|" or newline in a
        # NOTE(review): title or comment would presumably corrupt the manifest
        # NOTE(review): line - confirm against real dumps.

        append(join(pages_dir, pageid, "manifest"),
            "%s|AddRevision|_|%s|%s|%s|%s|%d\n" % ( # blank added for consistency with AddAttachment
            content["version"],
            versionfile,
            title, # comment titles will incorporate the comment's position
            content.get("lastModifierName") or content.get("lastModifier"),
            content["versionComment"],
            date_to_seconds(content["lastModificationDate"])
            ))

        # Add information to parent pages for child page lists.

        if "parent" in content:
            parentid = content["parent"]
            mkdirs(join(pages_dir, parentid))
            append(join(pages_dir, parentid, "children"), title + "\n")

        # Add creation details for comments to the owner page.
        # Since comments can be versioned, the date of the original version
        # is used, and only this "original" version has the owner property.

        if objecttype == "Comment" and "owner" in content:
            ownerid = content["owner"]
            mkdirs(join(pages_dir, ownerid))
            append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))

        # Some metadata is not particularly relevant. For example,
        # ancestors, children, parent are navigation-related.

        # Other metadata could be added to the page content itself.
        # For example, labelling could be converted to categories.

    def _handle_body(self, content):

        "Write the body of a revision to its version file, translating it."

        versions_dir = join(self.space, "versions")

        body = content["body"]
        if not body:
            body = "## Empty page."

        is_comment_page = content.get("content:class") == "Comment"

        # NOTE: Very simple technique employed for guessing the format.

        # The handler's own setting is consulted, not a module-level name, so
        # that the class also works when the module is imported.

        if self.no_translate:
            fn = notranslate
        elif body.startswith("<"):
            fn = xmltranslate
        else:
            fn = translate

        try:
            fn(join(versions_dir, content["content"]), body, is_comment_page)
        except Exception:

            # Identify the offending revision before propagating the error.

            err = codecs.getwriter("utf-8")(sys.stderr)
            err.write("Error parsing %s\n" % content["content"])
            raise

    def _handle_attachment(self, identifier, content):

        "Record an attachment in the attachments manifest of its page."

        pages_dir = join(self.space, "pages")

        pageid = content.get("content") or content.get("containerContent")

        # The fallback version is a string because it becomes a path component.

        version = content.get("attachmentVersion") or content.get("version") or "0"
        attachid = content.get("originalVersion", identifier)

        # The attachment may be encountered before any revision of its page.

        mkdirs(join(pages_dir, pageid))

        append(join(pages_dir, pageid, "attachments"),
            "%s|AddAttachment|%s|%s|%s|%s|%s|%d\n" % (
            version,
            # Have to "taint" archive filenames, although Moin will
            # probably handle package script filename tainting.
            wikiutil.taintfilename(join("attachments", pageid, attachid, version)),
            wikiutil.taintfilename(content.get("fileName") or content.get("title")),
            "", # pagename is substituted later
            content.get("lastModifierName") or content.get("lastModifier"),
            content.get("comment") or content.get("versionComment"),
            date_to_seconds(content["lastModificationDate"])
            ))

    def handle_property(self, name, elements, attributes, all_text, text):

        """
        Record properties in the current content dictionary. Where the property
        element has a class attribute, it is recorded using a key of the form
        "<property name>:class".
        """

        property_name = attributes[-1]["name"]
        self.content[property_name] = text.strip()

        property_class = attributes[-1].get("class")
        if property_class:
            self.content["%s:%s" % (property_name, "class")] = property_class.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        "Promote identifiers to the parent element's text."

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        "Record collections in the current content dictionary."

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        "Add elements to the current collection."

        self.elements.append((attributes[-1]["class"], text.strip()))
paul@0 | 240 | |
def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth.

    An already existing directory is not treated as an error. Any other
    failure, such as a lack of permissions or 'name' being occupied by a file,
    is raised as OSError instead of being silently ignored.
    """

    import errno
    from os.path import isdir

    try:
        makedirs(name)
    except OSError as exc:

        # Only tolerate the case where the directory is already present.

        if exc.errno != errno.EEXIST or not isdir(name):
            raise
paul@0 | 249 | |
def append(filename, s):

    """
    Add the string 's' to the end of the file with the given 'filename'. This
    delegates to the write function with appending enabled.
    """

    write(filename, s, append=True)
paul@0 | 255 | |
paul@0 | 256 | def write(filename, s, append=False): |
paul@2 | 257 | |
paul@2 | 258 | """ |
paul@2 | 259 | Write to the file with the given 'filename' the string 's'. If the optional |
paul@2 | 260 | 'append' parameter is set to a true value, 's' will be appended to the file. |
paul@2 | 261 | """ |
paul@2 | 262 | |
paul@2 | 263 | f = codecs.open(filename, append and "a" or "w", encoding="utf-8") |
paul@0 | 264 | try: |
paul@0 | 265 | f.write(s) |
paul@0 | 266 | finally: |
paul@0 | 267 | f.close() |
paul@0 | 268 | |
def read(filename):

    """
    Return the complete contents of the file with the given 'filename' as a
    string, decoding the file as UTF-8.
    """

    with codecs.open(filename, encoding="utf-8") as f:
        contents = f.read()

    return contents
paul@3 | 281 | |
def translate(filename, body, is_comment_page, fn=None):

    """
    Write to the file with the given 'filename' a translation of the given
    'body'.

    The output starts with a "#pragma page-filename" line naming 'filename',
    followed by whatever the translation function writes. The function 'fn' is
    called with 'body', the open output stream and 'is_comment_page'; if it is
    omitted, wikiparser.parse is used.
    """

    fn = fn or wikiparser.parse

    out = codecs.open(filename, "w", encoding="utf-8")
    try:
        # A plain write produces the same line as the print statement would,
        # without relying on Python 2 syntax.

        out.write("#pragma page-filename %s\n" % filename)
        fn(body, out, is_comment_page)
    finally:
        out.close()
paul@11 | 297 | |
def xmltranslate(filename, body, is_comment_page):

    """
    Write to the file with the given 'filename' a translation of the given
    'body', using xmlparser.parse as the translation function.
    """

    translate(filename, body, is_comment_page, fn=xmlparser.parse)
paul@142 | 300 | |
def notranslate(filename, body, is_comment_page):

    """
    Write the given 'body' unchanged to the file with the given 'filename'.
    The 'is_comment_page' parameter is accepted for compatibility with the
    other translation functions and is not used.
    """

    write(filename, body, append=False)
paul@25 | 303 | |
def sort_comments(pages_dir, pageid):

    """
    Where the page identified by 'pageid' within 'pages_dir' has a "comments"
    file, order the entries of that file and give each comment page a title
    made from the owner page's title and the comment's position in the ordered
    sequence. Each entry has the form "<creation date>|<comment page id>", so
    ordering the entries orders the comments by creation date. The titles are
    stored in a "pagetitle" file in each comment page's directory.

    Nothing is done if the page has no "comments" file.
    """

    comments_file = join(pages_dir, pageid, "comments")

    if not exists(comments_file):
        return

    owner_title = read(join(pages_dir, pageid, "pagetitle"))

    records = sorted(
        line.split("|") for line in read(comments_file).split("\n") if line
        )

    # Write the sorted comments list for testing purposes.

    write(comments_file, "\n".join(["|".join(record) for record in records]))

    # Define comments as subpages by setting their titles using this
    # page's name/title and their position in the comments collection.
    # The title is written to a special file in each comment's page directory
    # for later processing.

    position = 0

    for _created, commentid in records:
        comment_title = "%s/%04d" % (owner_title, position)
        write(join(pages_dir, commentid, "pagetitle"), comment_title)
        position += 1
paul@31 | 336 | |
def _manifest_to_mapping(manifest, output_mapping):

    """
    Read the given 'manifest' and append to the file named 'output_mapping' a
    line for each manifest entry having a title. Each line consists of the
    final path component of the entry's version filename, a tab character and
    the title.
    """

    with codecs.open(manifest, "r", encoding="utf-8") as source:
        records = [raw.split("|") for raw in source.readlines()]

        rows = []

        # Every manifest entry is expected to provide exactly eight columns.

        for (_version, _action, _archive_filename, filename, title,
             _username, _comment, _mtime) in records:

            if title:
                rows.append("%s\t%s\n" % (split(filename)[-1], title))

        append(output_mapping, "".join(rows))
paul@95 | 358 | |
paul@40 | 359 | def _sort_manifest(manifest, title): |
paul@40 | 360 | |
paul@40 | 361 | """ |
paul@40 | 362 | Open the given 'manifest' and sort it according to revision so that it will |
paul@40 | 363 | be added to MoinMoin in the correct order. |
paul@40 | 364 | |
paul@40 | 365 | If a 'title' is provided, the title column in the manifest will be augmented |
paul@40 | 366 | with that information. This is typically done for comments and is necessary |
paul@40 | 367 | for attachments. |
paul@40 | 368 | |
paul@40 | 369 | A list of manifest entries is returned. |
paul@40 | 370 | """ |
paul@40 | 371 | |
paul@40 | 372 | f = codecs.open(manifest, "r", encoding="utf-8") |
paul@40 | 373 | try: |
paul@109 | 374 | lines = [x.rstrip("\n").split("|") for x in f.readlines()] |
paul@40 | 375 | lines.sort(cmp=lambda x, y: cmp(int(x[0]), int(y[0]))) |
paul@40 | 376 | |
paul@40 | 377 | # Reconstruct the lines, optionally changing the titles. |
paul@40 | 378 | |
paul@40 | 379 | result = [] |
paul@40 | 380 | |
paul@40 | 381 | for line in lines: |
paul@123 | 382 | version, _action, _archive_filename, filename, old_title, username, comment, mtime = line |
paul@40 | 383 | |
paul@40 | 384 | # Replace title information with the information already present. |
paul@40 | 385 | |
paul@53 | 386 | if not old_title: |
paul@40 | 387 | new_title = title |
paul@40 | 388 | else: |
paul@40 | 389 | new_title = old_title |
paul@40 | 390 | |
paul@40 | 391 | # The version is omitted now that the manifest is ordered. |
paul@40 | 392 | |
paul@123 | 393 | line = _action, _archive_filename, filename, new_title, username, comment, mtime |
paul@40 | 394 | result.append(line) |
paul@40 | 395 | |
paul@40 | 396 | return result |
paul@40 | 397 | |
paul@40 | 398 | finally: |
paul@40 | 399 | f.close() |
paul@40 | 400 | |
def serialise_manifest(manifest):

    """
    Return a string containing one line for each of the entries in 'manifest',
    with the columns of each entry separated by "|". For "AddRevision" entries,
    the second column is superfluous and is left out.
    """

    def serialise(entry):
        fields = list(entry)
        if fields[0] == "AddRevision":
            fields.pop(1)
        return "|".join(fields) + "\n"

    return "".join([serialise(entry) for entry in manifest])
paul@40 | 417 | |
def sort_manifest(pages_dir, pageid, output_mapping=None, no_translate=False):

    """
    Using the given 'pageid', locate the manifest for the page and any page
    title information written to a "pagetitle" file.

    Then sort the manifest according to revision so that historical operations
    such as page renaming can be detected. Where the title changes between
    consecutive revisions, an extra revision is added for the old title,
    having "#REDIRECT <new title>" as its content.

    If a "pagetitle" file exists, the title column in the manifest will be
    augmented with the contents of that file. This is typically done for
    comments.

    If a "pagetype" file indicates a comment, each revision is given an ACL and
    a comment owner pragma naming the user responsible for the revision.

    If a "children" file exists, the pages in that file will be added as a list
    to the end of each revision's content. If a "comments" file exists and the
    page has a title, the comments template is also added. Neither is done if
    'no_translate' is set to a true value.

    If 'output_mapping' is given, a mapping from version identifiers to page
    titles will be appended to the file having that filename.

    A list of manifest entries for the page's revisions and attachments is
    returned.
    """

    page_dir = join(pages_dir, pageid)

    pagetype = join(page_dir, "pagetype")
    manifest = join(page_dir, "manifest")
    attachments = join(page_dir, "attachments")
    pagetitle = join(page_dir, "pagetitle")
    children = join(page_dir, "children")
    comments = join(page_dir, "comments")

    page_type = None
    if exists(pagetype):
        page_type = read(pagetype)

    if exists(pagetitle):
        title = read(pagetitle)
        space, _page_name = get_space_and_name(title)
    else:
        title = space = None

    # Sort the revision manifest.

    result = _sort_manifest(manifest, title)

    # Output a mapping of identifiers to page names.

    if output_mapping:
        _manifest_to_mapping(manifest, output_mapping)

    # Prepare the child page section once, since it is the same for every
    # revision of the page.

    child_pages_text = None

    if exists(children) and not no_translate:
        child_pages = []
        child_page_names = [x for x in read(children).split("\n") if x]
        child_page_names.sort()

        # Produce links which hide the space prefix.

        for child_page_name in child_page_names:
            child_space, page_name = get_space_and_name(child_page_name)
            if child_space == space:
                child_page_label = page_name
            else:
                child_page_label = child_page_name

            child_pages.append(" * [[%s|%s]]" % (child_page_name, child_page_label))

        child_pages_text = child_page_section % "\n".join(child_pages)

    # Determine once whether the comments section applies.

    comments_text = None

    if exists(comments) and title and not no_translate:
        comments_text = comment_section

    # Modify the content to include child pages and comments.

    last_title = None
    final_result = []

    for details in result:
        action, _archive_filename, filename, new_title, username, _comment, mtime = details

        # Detect renamed pages and add a redirect revision.

        if last_title and last_title != new_title and action == "AddRevision":
            renaming_versionfile = filename + ".rename"
            final_result.append((action, "_", renaming_versionfile, last_title, username, "Page renamed to %s" % new_title, mtime))
            write(renaming_versionfile, "#REDIRECT %s" % new_title)

        last_title = new_title

        # Add this revision to the manifest.

        final_result.append(details)

        # Obtain the text only if modifications are to be made.

        text = None

        # Add an ACL to comment pages so that people cannot change other
        # people's comments.
        # NOTE: This should match the PostComment action.

        if page_type == "Comment":
            text = (
                "#acl %s:read,write,delete,revert All:read\n"
                "#pragma comment-owner %s\n"
                "%s"
                ) % (username, username, read(filename))

        # Add child page information to the content.

        if child_pages_text is not None:
            text = (text or read(filename)) + child_pages_text

        # Add comments to the content.

        if comments_text is not None:
            text = (text or read(filename)) + comments_text

        # Rewrite the file if necessary.

        if text:
            write(filename, text)

    # Add the attachments to the manifest.

    if exists(attachments):
        final_result += _sort_manifest(attachments, title)

    return final_result
paul@123 | 533 | |
def sort_final_manifest(entries, output):

    """
    Sort the manifest 'entries' by last modified time and serialise it.
    The manifest details will be appended to the file named by 'output'.

    The 'entries' list is sorted in place, with the times being compared as
    integers and entries having equal times keeping their existing order.
    """

    # The final entry in each element is the mtime.
    # A key function gives the same ordering as a comparison function would,
    # converting each time only once.

    entries.sort(key=lambda entry: int(entry[-1]))

    # Serialise the manifest.

    append(output, serialise_manifest(entries))
paul@3 | 549 | |
def get_space_and_name(page_name):

    """
    Split the given 'page_name' at the first "/" into a space and a name,
    returning the two values as a sequence. Where 'page_name' contains no "/",
    the space is None and the name is 'page_name' itself.
    """

    parts = page_name.split("/", 1)

    # Splitting never raises an exception: a name without a separator merely
    # yields a single part, which callers could not unpack as a pair.

    if len(parts) == 2:
        return parts

    return None, page_name
paul@61 | 555 | |
# Template for child page information.
# The single "%s" placeholder is filled by sort_manifest with the list of links
# to child pages, one per line, and the result is added to the end of the
# revision text after a horizontal rule.

child_page_section = """
----

%s
"""

# Template for comments.
# This is added unchanged by sort_manifest to the end of the revision text of
# pages having comments, placing an IncludeComments macro call after a
# horizontal rule.

comment_section = """
----

<<IncludeComments>>
"""
paul@32 | 571 | |
paul@28 | 572 | # Main program. |
paul@28 | 573 | |
if __name__ == "__main__":

    # Obtain the dump filename and the space name from the arguments.

    try:
        filename = sys.argv[1]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = sys.argv[2]
    except IndexError:
        sys.stderr.write("""
Please specify an XML file containing Wiki data, a workspace name, and an
optional attachments directory location. For example:

%(progname)s com_entities.xml COM attachments

Adding --no-translate will unpack the Wiki but not translate the content.
When doing so without an attachments directory, add an empty argument as
follows:

%(progname)s com_entities.xml COM '' --no-translate

An archive can be used instead of the XML file, and since this may include
attachments, no additional attachments directory needs to be specified:

%(progname)s COM-123456-789012.zip COM
""" % {"progname" : split(sys.argv[0])[-1]} + "\n")

        sys.exit(1)

    # The optional third argument names the attachments directory. An empty
    # argument or the --no-translate option in that position means that no
    # directory was given.

    if len(sys.argv) > 3 and sys.argv[3] and sys.argv[3] != "--no-translate":
        attachments = sys.argv[3]
    else:
        attachments = None

    no_translate = "--no-translate" in sys.argv

    # Refuse to overwrite the output of an earlier conversion.

    if exists(space):
        sys.stderr.write("Directory exists for space %s. Please choose another or remove its contents.\n" % space)
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        sys.stderr.write("Page package exists. Please remove or rename it: %s\n" % package_zip)
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is used since the file may be an archive.

    f = open(filename, "rb")

    try:
        if is_zipfile:
            zf = ZipFile(f)
            ff = StringIO(zf.read("entities.xml"))
        else:
            ff = f

        # Parse the data.

        p.parse(ff)

        # Tidy up the import manifests, sorting each of them by revision and
        # finalising them.

        pages_dir = join(space, "pages")

        for pageid in listdir(pages_dir):
            sort_comments(pages_dir, pageid)

        output_mapping = join(space, "MAPPING")

        output_manifest = join(space, "MOIN_PACKAGE")
        append(output_manifest, "MoinMoinPackage|1\n")

        entries = []

        for pageid in listdir(pages_dir):
            entries += sort_manifest(pages_dir, pageid, output_mapping, no_translate)

        sort_final_manifest(entries, output_manifest)

        # Write the page package.

        page_package = ZipFile(package_zip, "w")

        try:
            # Include the page revisions.

            versions_dir = join(space, "versions")

            for versionid in listdir(versions_dir):
                page_package.write(join(versions_dir, versionid))

            # Include the attachments.

            if attachments:

                # Ignore any trailing separator so that the final path
                # component is the directory itself, and visit the directory
                # from its parent so that archive names start with that
                # component. A bare directory name has an empty parent, which
                # denotes the current directory.

                parent_dir, attachments_name = split(attachments.rstrip("/") or attachments)

                cwd = getcwd()
                chdir(parent_dir or ".")
                try:
                    for path, dirnames, filenames in walk(attachments_name):
                        for attachment_filename in filenames:
                            # Have to "taint" archive filenames.
                            attachment_path = join(path, attachment_filename)
                            page_package.write(attachment_path, wikiutil.taintfilename(attachment_path))
                finally:
                    chdir(cwd)

            elif is_zipfile:
                for member in zf.namelist():
                    if member.startswith("attachments"):
                        # Have to "taint" archive filenames.
                        page_package.writestr(wikiutil.taintfilename(member), zf.read(member))

            # Include only the top-level manifest.

            page_package.write(output_manifest, "MOIN_PACKAGE")

        finally:
            page_package.close()

    finally:
        f.close()
paul@0 | 705 | |
paul@0 | 706 | # vim: tabstop=4 expandtab shiftwidth=4 |