# HG changeset patch # User Paul Boddie # Date 1286146987 -7200 # Node ID faec742b3844490c2b49835362a0b89d24100860 # Parent b78423a026624a77a46908c6664efd12957f1972 Added meta tag title and description processing (as titles and introductions respectively). Added elementary support for decoding titles and descriptions so that they become Unicode as soon as they have been extracted. Tidied up the quoting of introductions. diff -r b78423a02662 -r faec742b3844 actions/AddLinkToPage.py --- a/actions/AddLinkToPage.py Sun Aug 15 18:31:26 2010 +0200 +++ b/actions/AddLinkToPage.py Mon Oct 04 01:03:07 2010 +0200 @@ -20,14 +20,58 @@ # Page parsing. -macro_pattern = re.compile(ur'^(?P.*?)<[^\s,)]+).*?\)>>(?P.*)$', +macro_pattern = re.compile( + ur'^(?P.*?)' # leading text on the line + ur'<[^\s,)]+).*?' # identifier + ur'\)>>' # macro epilogue + ur'(?P.*)$', # trailing text on the line re.MULTILINE | re.UNICODE) # Link visiting and parsing. -title_pattern = re.compile(ur'<(?Ptitle|h\d)(\s.*?)?>(?P.*?)</(?P=tag)>', re.MULTILINE | re.UNICODE | re.DOTALL) -paragraph_pattern = re.compile(ur'<p(\s.*?)?>(?P<text>.*?)(?=<p(\s.*?)?>|</p>)', re.MULTILINE | re.UNICODE | re.DOTALL) -tag_pattern = re.compile(ur'<.*?>', re.MULTILINE | re.UNICODE | re.DOTALL) +def attr_pattern(agroup, attrname, qgroup, vgroup): + return ( + ur'''(?P<%s>%s)''' # attribute name + ur'''\s*=\s*''' # = + ur'''(?P<%s>['"])''' # opening quote + ur'''(?P<%s>.*?)''' # value + ur'''(?P=%s)''' # closing quote + % (agroup, attrname, qgroup, vgroup, qgroup) + ) + +meta_pattern = re.compile( + ur'<meta' + ur'([^>]*?' + ur'(' + ur'(' + attr_pattern("nattr", "name", "nquote", "name") + ')' + ur'|(' + attr_pattern("cattr", "content", "cquote", "content") + ')' + ur')' + ur')*' + ur'[^>]*?>', + re.MULTILINE | re.DOTALL) + +title_pattern = re.compile( + ur'<(?P<tag>title|h\d)(\s.*?)?>' + ur'(?P<title>.*?)' + ur'</(?P=tag)>', + re.MULTILINE | re.DOTALL) + +paragraph_pattern = re.compile( + ur'<p(\s.*?)?>' + ur'(?P<text>.*?)' + ur'(?=<p(\s.*?)?>|</p>)', + re.MULTILINE | re.DOTALL) + +tag_pattern = re.compile( + ur'<.*?>', + re.MULTILINE | re.DOTALL) + +def get_text(s): + try: + return unicode(s, "utf-8") + except UnicodeError: + return unicode(s, "iso-8859-1") def get_link_info(link): @@ -42,6 +86,25 @@ try: s = f.read() + + # Look for metadata. + + title = None + intro = None + + for meta_match in meta_pattern.finditer(s): + name = meta_match.group("name") + content = meta_match.group("content") + if name == "title": + title = content + elif name == "description": + intro = content + + if title and intro: + return get_text(title), get_text(intro) + + # Look for titles/headings and accompanying text. + first_title = "" for title_match in title_pattern.finditer(s): @@ -54,11 +117,11 @@ for intro_match in paragraph_pattern.finditer(s[end:]): intro = get_flattened_content(intro_match.group("text")).strip() if intro: - return title, intro + return get_text(title), get_text(intro) finally: f.close() - return first_title, "" + return get_text(first_title), u"" def get_flattened_content(s): @@ -71,7 +134,7 @@ l.append(s[last:start]) last = end l.append(s[last:]) - return "".join(l).replace("\n", " ") + return get_text("".join(l).replace("\n", " ")) # Action class and supporting functions. @@ -116,7 +179,7 @@ "page_url" : wikiutil.quoteWikinameURL(page.page_name) } - html = ''' + html = u''' <form class="macro" method="POST" action="%(script_name)s/%(page_url)s"> <input type="hidden" name="identifier" value="%(identifier)s" /> <input type="hidden" name="doit" value="1" /> @@ -201,7 +264,7 @@ # NOTE: Should support different formatting options. link_details = "%s[[%s%s]]%s" % ( - introduction and (get_verbatim('"%s" ' % introduction)) or "", + introduction and ('"%s" ' % get_verbatim(introduction)) or "", link, title and ('|%s' % title) or "", description and (" - ''%s''" % description) or "" diff -r b78423a02662 -r faec742b3844 macros/AddLinkToPage.py --- a/macros/AddLinkToPage.py Sun Aug 15 18:31:26 2010 +0200 +++ b/macros/AddLinkToPage.py Mon Oct 04 01:03:07 2010 +0200 @@ -37,7 +37,10 @@ # The macro's identifier should always appear first. - identifier = parsed_args[0] + try: + identifier = parsed_args[0] + except IndexError, exc: + return macro.format_error(exc) # Look for keywords determining the action of the macro.