# HG changeset patch # User Paul Boddie # Date 1370882435 -7200 # Node ID 3f0fbef873932cee301991cdd9bec7e4cb903eca # Parent 4a10cbd14a491bc903e2e3607f004067acc1a931 Supported table recognition in region extraction in order to handle sections within tables, where the appearance of sections would break up tables around those sections. diff -r 4a10cbd14a49 -r 3f0fbef87393 tests/test_tables_sections_mixed.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/test_tables_sections_mixed.txt Mon Jun 10 18:40:35 2013 +0200 @@ -0,0 +1,21 @@ +Here are some examples of possible regular expressions and matching lines: +|| Regular expression || Matching lines || +| zuff | Keywords: zuff | +| zuff | Keywords: ZUFF | +| zuff | Keywords: Zuff | +| zuff | Keywords: amaryllis, zuff, applesauce | +| zuff | Subject: \[zuff\] Do you have the right stuff for zuff? | +| zuff | Subject: Do you have the right stuff for zuff? | +| zuff | Subject: What is zuff? | +| {noformat:nopanel=true} + \[zuff\] +{noformat} | Keywords: \[zuff\] | +| {noformat:nopanel=true} + \[zuff\] +{noformat} | Subject: \[zuff\] Do you have the right stuff? | +| {noformat:nopanel=true} + \[zuff\] +{noformat} | Subject: Online zuff tutorials (was Re: \[zuff\] What is zuff?) | +A few notes: +* The matching is case-insensitive, so if zuff matches, so will ZUFF, zuFF, and any other variations in capitalization. +* Some characters have special meaning in a regular expression, so to match those characters specifically, they must be "escaped" with a backslash (). As you can see in the above example, \[ and \] are such characters. (Others include ".", "?", and "*"). The backslash is also used for other things (I wasn't kidding about regular expressions being complex: consult other documentation for details about other uses of the backslash character), but this is the most likely use in a topic expression. 
diff -r 4a10cbd14a49 -r 3f0fbef87393 wikiparser.py --- a/wikiparser.py Mon Jun 10 13:45:29 2013 +0200 +++ b/wikiparser.py Mon Jun 10 18:40:35 2013 +0200 @@ -39,7 +39,7 @@ # Section extraction. -sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(?P<options>:[^}\n]+)?}" +sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(?P<options>:[^}\n]+)?}|^(?P<rowstart>[|]{1,2})|(?P<rowend>[|]{1,2})(\n|$)" sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE) def get_regions(s): @@ -52,11 +52,13 @@ last = 0 regions = [""] depth = 0 + had_row = False for match in sections_regexp.finditer(s): start, end = match.span() - is_start = match.group("options") + is_start = match.group("options") or match.group("rowstart") is_section = is_section_marker(match.group("type")) + is_row = match.group("rowstart") or match.group("rowend") # The start of a region is either indicated by a marker with options or # by a marker where no region is currently active. @@ -74,6 +76,16 @@ if is_section: regions.append(s[start:end]) + # A new row may either continue a table region or start a new + # table region. + + elif is_row: + if (last != start or not had_row): + regions.append(s[start:end]) + else: + regions[-2] += regions[-1] + s[start:end] + regions.pop() + # Certain markers may be standalone macros. else: @@ -85,7 +97,7 @@ else: regions[-1] += s[last:end] - if is_section: + if is_section or is_row: depth += 1 # The end of a region is indicated by a marker with no options. @@ -106,7 +118,7 @@ # current region and the details of the region are then obtained. else: - if depth > 1 or not is_section: + if depth > 1 or (not is_section and not is_row): regions[-1] += s[last:end] # Terminate the active region, interpreting its contents. @@ -115,9 +127,10 @@ regions[-1] += s[last:end] regions.append("") - if is_section: + if is_section or is_row: depth -= 1 + had_row = is_row last = end # Where a region is still active, terminate it.