Josh Pratt | f4dee35 | 2018-07-30 17:04:49 +1000 | [diff] [blame^] | 1 | """A set of helper functions for parsing text.""" |
| 2 | |
| 3 | import re |
| 4 | |
NEWLINE = '\n'
# Matches a run of one or more whitespace characters. Note that this must be
# \s (whitespace) and not \w, which matches word characters instead.
WHITESPACE_RE = re.compile(r'\s+')


def remove_empty_lines(text):
    """Removes blank lines from text.

    A line is blank when it is empty or contains only whitespace. Lines that
    are kept are left untouched, so their leading and trailing whitespace is
    preserved.

    Args:
      text(string): Text to remove blank lines from.
    Returns:
      The non-blank lines joined back together with newlines.
    """
    # str.strip() yields '' (falsy) for both empty and whitespace-only lines.
    return NEWLINE.join(
        line for line in text.split(NEWLINE) if line.strip())
| 16 | |
| 17 | |
def whitespace(text):
    """Splits the leading whitespace off a string.

    Args:
      text(string): Text to split the leading whitespace from.
    Returns:
      A tuple containing the leading whitespace and the remaining text. The
      first element is '' when text does not start with whitespace.
    """
    # str.lstrip() removes exactly the leading whitespace, so the length
    # difference tells us where the whitespace prefix ends.
    remainder = text.lstrip()
    split_at = len(text) - len(remainder)
    return (text[:split_at], remainder)
| 30 | |
| 31 | |
def until(text, suffixes):
    """Splits text at the first suffix or 'end token'.

    The earliest occurrence of any suffix wins; when several suffixes match
    at that same position, the one listed first in suffixes is used.

    Args:
      text(string): The text to split.
      suffixes(List[string]): a list of strings that mark the end of a block.
          Empty strings are ignored, as they cannot mark a position.
    Returns:
      A tuple containing the text before the end token, the end token and any
      remaining, unprocessed, text. If no end token is found, the whole text
      is returned as the first element and the other two are empty strings.
    """
    # Escape each suffix so it is matched literally. Empty suffixes are
    # dropped: an empty alternative (or an empty pattern, when suffixes is
    # empty) matches at position 0 and would yield ('', '', text), consuming
    # nothing.
    alternatives = [re.escape(suffix) for suffix in suffixes if suffix]
    if not alternatives:
        return text, '', ''

    # Convert the search for each suffix into a single regex.
    match = re.search('|'.join(alternatives), text)
    if not match:
        # If no suffix is found, consume the whole string.
        return text, '', ''

    # Split the text around the suffix that was found.
    return text[:match.start()], match.group(), text[match.end():]