From b0cc5442f2ef1e3edbdcf4bb94f8fdcd16852e49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Fri, 6 Jun 2025 14:17:34 +0200 Subject: [PATCH] Scripts: Fix extract_classes.py to better handle XML indentation (#26) No need to special case codeblocks, we can just rely on textwrap.dedent and keep the original element text verbatim. --- scripts/extract_classes.py | 78 ++++---------------------------------- 1 file changed, 7 insertions(+), 71 deletions(-) diff --git a/scripts/extract_classes.py b/scripts/extract_classes.py index 1decbc0..3f4ada0 100644 --- a/scripts/extract_classes.py +++ b/scripts/extract_classes.py @@ -3,6 +3,7 @@ import argparse import os import shutil +import textwrap from collections import OrderedDict EXTRACT_ATTRIBS = ["deprecated", "experimental"] @@ -180,69 +181,6 @@ def _collect_classes_file(path, classes): print_error("Unknown XML file {}, skipping".format(path)) -## regions are list of tuples with size 3 (start_index, end_index, indent) -## indication in string where the codeblock starts, ends, and it's indent -## if i inside the region returns the indent, else returns -1 -def _get_xml_indent(i, regions): - for region in regions: - if region[0] < i < region[1]: - return region[2] - return -1 - - -## find and build all regions of codeblock which we need later -def _make_codeblock_regions(desc, path=""): - code_block_end = False - code_block_index = 0 - code_block_regions = [] - while not code_block_end: - code_block_index = desc.find("[codeblock]", code_block_index) - if code_block_index < 0: - break - xml_indent = 0 - while True: - ## [codeblock] always have a trailing new line and some tabs - ## those tabs are belongs to xml indentations not code indent - if desc[code_block_index + len("[codeblock]\n") + xml_indent] == "\t": - xml_indent += 1 - else: - break - end_index = desc.find("[/codeblock]", code_block_index) - if end_index < 0: - print_error("Non terminating codeblock: {}".format(path)) - exit(1) - code_block_regions.append((code_block_index, end_index, xml_indent)) - code_block_index += 1 - return code_block_regions - - -def _strip_and_split_desc(desc, code_block_regions): - desc_strip = "" ## a stripped desc msg - total_indent = 0 ## code indent = total indent - xml indent - for i in range(len(desc)): - c = desc[i] - if c == "\n": - c = "\\n" - if c == '"': - c = '\\"' - if c == "\\": - c = "\\\\" ## is invalid for msgmerge - if c == "\t": - xml_indent = _get_xml_indent(i, code_block_regions) - if xml_indent >= 0: - total_indent += 1 - if xml_indent < total_indent: - c = "\\t" - else: - continue - else: - continue - desc_strip += c - if c == "\\n": - total_indent = 0 - return desc_strip - - def _c_escape(string): result = "" for i in range(len(string)): @@ -289,9 +227,9 @@ def _make_translation_catalog(classes): continue line_no = elem._start_line_number if elem_text[0] != "\n" else elem._start_line_number + 1 - desc_str = elem_text.strip() - code_block_regions = _make_codeblock_regions(desc_str, desc_list.path) - desc_msg = _strip_and_split_desc(desc_str, code_block_regions) + # The magic happens here, we remove XML indentation (keeping potential indentation in code blocks), + # and escape special characters. The actual clean POT format with wrapping is handled later with msgmerge. + desc_msg = _c_escape(textwrap.dedent(elem_text).strip()) desc_obj = Desc(line_no, desc_msg, desc_list) desc_list.list.append(desc_obj) @@ -329,11 +267,9 @@ def _generate_translation_catalog_file(unique_msgs, output, location_line=False) f.write('msgid "{}"\n'.format(msg)) f.write('msgstr ""\n\n') - ## TODO: what if 'nt'? - if os.name == "posix": - print("Wrapping template at 80 characters for compatibility with Weblate.") - os.system("msgmerge -w80 {0} {0} > {0}.wrap".format(output)) - shutil.move("{}.wrap".format(output), output) + print("Wrapping template at 80 characters for compatibility with Weblate.") + os.system("msgmerge -w80 {0} {0} > {0}.wrap".format(output)) + shutil.move("{}.wrap".format(output), output) def main():