Source code for exegis.aphorisms_to_xml

"""
This module has been written to convert transcribed commentaries from text
files to TEI compatible XML.

Funding is provided by an ERC funded project studying Arabic commentaries on
the Hippocratic Aphorisms. The Principal Investigator is Peter E. Pormann,
The University of Manchester.

It is anticipated the module will be used via the main.py module
which attempts to process any input file or directory containing files with
a .txt extension.

Each text file base name should end in an underscore followed by a
numerical value, e.g. file_1.txt, file_2.txt, etc. The numerical value is
subsequently used when creating the title section ``<div>`` element, e.g.
``<div n="1" type="Title_section">`` for file_1.txt.

.. note::
    This is optional, by default the version is set at 1.

If processing succeeds two XML files will be created in a folder called XML.
The XML file names start with the text file base name and end in _main.xml (for
the main XML) and _app.xml (for the apparatus XML). For example for file_1.txt
the XML files will be file_1_main.xml and file_1_app.xml.

If processing fails error messages will be saved in the exegis.log file.

The commentaries should be utf-8 text files with the format as documented
in the associated documentation (docs/_build/index.html).

:Authors: Jonathan Boyle, Nicolas Gruel <nicolas.gruel@manchester.ac.uk>

:Copyright: IT Services, The University of Manchester
"""
# pylint: disable=locally-disabled, invalid-name
import os
import re

from lxml import etree

try:
    from .analysis import references, footnotes, AnalysisException
    from .introduction import Introduction
    from .introduction import IntroductionException
    from .title import Title, TitleException
    from .footnotes import Footnotes, FootnotesException
    from .baseclass import Exegis, logger, TEMPLATE_FNAME, RELAXNG_FNAME
except ImportError:
    from analysis import references, footnotes, AnalysisException
    from introduction import Introduction, IntroductionException
    from title import Title, TitleException
    from footnotes import Footnotes, FootnotesException
    from baseclass import Exegis, logger, TEMPLATE_FNAME, RELAXNG_FNAME


# Define an Exception
class AphorismsToXMLException(Exception):
    """Error raised when a text file cannot be converted to XML.

    The methods of :class:`Process` raise this exception, after logging the
    reason, whenever a step of the conversion fails.
    """


class Process(Exegis):
    """Class to process a Hippocratic aphorisms text and produce a TEI XML
    file.

    Attributes
    ----------
    fname : str
        Name of the file to convert.
        The text file base name is expected to end with an underscore followed
        by a numerical value, e.g. file_1.txt, file_2.txt, etc. This numerical
        value is used when creating the title section <div> element, e.g.
        <div n="1" type="Title_section"> for file_1.txt.

    folder : str, optional
        Name of the folder containing the files to convert.

    doc_num : int, optional
        Version of the document treated.
        Default value: 1
    """
    def __init__(self, fname=None, folder=None, doc_num=1):
        """Store the input location and reset the conversion state.

        Parameters
        ----------
        fname : str, optional
            Name of the file to convert. When given, ``set_basename`` is
            called immediately.

        folder : str, optional
            Name of the folder which contains ``fname``.

        doc_num : int, optional
            Version of the document treated. Default value: 1
        """
        Exegis.__init__(self)

        # Where the input lives and which version of the document it is.
        self.folder = folder
        self.fname = fname
        self.doc_num = doc_num

        # Template used to build the XML; the Relax NG schema file name is
        # only resolved later, in ``read_template``.
        self.template_fname = TEMPLATE_FNAME
        self.relaxng_fname = None

        # The base name can only be derived once a file name is known.
        if self.fname is None:
            self.base_name = None
        else:
            self.set_basename()

        # Footnotes object, created by ``treat_footnotes``.
        self.footnotes_app = None

        # Number of the next footnote expected in the main text.
        self._next_footnote = 1

        # Pieces of the document, filled in by the processing methods.
        self._introduction = ''
        self._title = ''
        self._aph_com = {}  # aphorism number -> [aphorism, commentaries...]
        self._text = ''
        self.footnotes = ''
        self._n_footnote = 1
        self.template = ''

        # NOTE(review): the XML accumulators used by the other methods
        # (``self.xml``, ``self.app``, ``self.wits``) are not initialised
        # here; presumably ``Exegis.__init__`` provides them - TODO confirm.

[docs]    def set_basename(self):
        """Method to set the basename attribute if fname is not None
        """
        self.base_name = os.path.splitext(os.path.basename(self.fname))[0]

        # Create folder for XML
        if not os.path.exists('XML'):
            os.mkdir('XML')

        # Set XML file name
        self.xml_file = os.path.join('XML', self.base_name + '.xml')

[docs]    def open_document(self, fname=None):
        """Method to open and read the exegis document.

        Parameters
        ----------
        fname : str, optional
            name of the file to analyse.

        Attributes
        ----------
        folder : str, optional
            Name of the folder where are the files to convert

        fname : str
            Name of the file to convert.
            The text file base name is expected to end with an underscore
            followed by a numerical value, e.g. file_1.txt, file_2.txt, etc.
            This numerical value is used when creating the title section
            <div> element, e.g. <div n="1" type="Title_section">
            for file_1.txt.

        text : str
            string which contains the whole file in utf-8 format.

        Raises
        ------
        AphorismsToXMLException
            if document can not be:
                - open
                - there subfolder present in the folder
                - file not treatable by the software (e.g. .DS_Store)
                - file does not exist
        """
        if fname is not None:
            self.folder, self.fname = os.path.split(fname)
            self.set_basename()

        if self.base_name is None and self.fname is not None:
            self.set_basename()

        if self.folder is None:
            self.folder = '.'

        if self.base_name is None:
            logger.error("There are no file to convert.")
            raise AphorismsToXMLException

        full_path = os.path.join(self.folder, self.fname)
        if os.path.isdir(full_path):
            logger.info('The software does not treat subfolder.')
            raise AphorismsToXMLException

        # Extract the document number, it is expected this is at the end of the
        # base name following an '_'
        try:
            sep, doc_num = self.base_name.rpartition('_')[1:]
            self.doc_num = int(doc_num)
            if sep == '':
                raise AphorismsToXMLException
        except ValueError:
            info = ('File name {} does not provide version information. '
                    'Use version 1 by default'.format(self.fname))
            logger.info(info)

        # Open the file to process
        # pylint: disable=locally-disabled, invalid-name
        try:
            with open(full_path, 'r', encoding="utf-8") as f:
                # Read in file
                self._text = f.read().strip()
        except UnicodeDecodeError:
            info = ('File {} is not treatable by the software'.format(
                self.fname))
            logger.info(info)
            raise AphorismsToXMLException
        except FileNotFoundError:
            info = ('File {} does not exist'.format(self.fname))
            logger.info(info)
            raise AphorismsToXMLException

[docs]    def divide_document(self):
        """Method to divide the document in the three main parts.

        An exegis document si composed in three or four main parts:

        - The introduction (optional)
        - The title
        - The aphorisms
        - The footnotes

        This method will divide the document in the three or four parts.

        Attributes
        ----------
        _introduction : str
            A string which contains the introduction of the document if present

        _title : str
            A string which contains the title of the document

        _text : str
            A string which contains the aphorisms and commentaries
            of the document

        _footnotes : str
            A string which contains the footnotes of the document

        Raises
        ------
        AphorismsToXMLException
            if it is not possible to divide the document.
        """

        # Not sure that is the best way to do but this is just a trial

        # cut the portion of the test, starting from the end, until the
        # characters footnotes_sep
        footnotes_sep = '*1*'
        loc_footnotes = self._text.rfind(footnotes_sep)

        if loc_footnotes == self._text.find(footnotes_sep):
            logger.error('Footnote referenced in the text but '
                         'no footnote section present.')
            self.footnotes = ''
            raise AphorismsToXMLException

        if loc_footnotes != -1:
            self.footnotes = self._text[loc_footnotes:].strip()
            self._text = self._text[:loc_footnotes]
        else:
            logger.info('There are no footnotes present.')

        # Cut the intro (if present)
        try:
            p = re.compile(r'\+\+\n')
            _tmp = p.split(self._text)
            if len(_tmp) == 3:
                self._title = _tmp[0].strip()
                self._introduction = _tmp[1].strip()
                self._text = _tmp[2].strip()
            elif len(_tmp) == 2:
                self._introduction = _tmp[0].strip()
                self._text = _tmp[1].strip()
        except ValueError as e:
            raise AphorismsToXMLException(e)

        try:
            p = re.compile(r'\n\s{0,}1\.?\n')
            if self._title == '':
                _tmp = p.split(self._text)
                self._title = _tmp[0]
                self._text = '1.\n' + '1.\n'.join(_tmp[1:])
        except ValueError as e:
            logger.error('Aphorism should have numeration as 1. or 1')
            raise AphorismsToXMLException(e)

        return

[docs]    def aphorisms_dict(self):
        """Create an order dictionary (OrderedDict object) with the aphorisms
        and commentaries.

        Attributes
        ----------
        _aph_com : dict
            dictionary which contains the aphorisms and the commentaries
            associated.

        Raises
        ------
        AphorismsToXMLException
            if it is not possible to create the dictionary.
        """
        aphorism = re.split(r'\n\s{0,}[0-9]+\.?\n', '\n' + self._text)[1:]

        # Split the text in function of the numbers (i.e. the separation
        # of the aphorism.
        # '\s[0-9]+.\n' means 'find string :
        #    which start with end of line or any space character
        #    with at least on number ending
        #    with a point and a end of line.
        p = re.compile(r'\n\s{0,}?[0-9]+\.?\n')
        error = ''
        try:
            n_aphorism = [int(i.group().strip('.\t\n '))
                          for i in p.finditer('\n' + self._text)]
            # Find missing aphorism or badly written (e.g.: 14-)
            missing = [i for i in list(range(1, max(n_aphorism)))
                       if i not in n_aphorism]
            # Find if multiple aphorism with the same number.
            doublon = list({i for i in n_aphorism if n_aphorism.count(i) > 1})
            if not n_aphorism:
                error = 'There are no aphorisms detected'
                logger.error(error)
            if max(n_aphorism) != len(n_aphorism):
                error = 'N aphorism expected {}, got: {}'.format(
                    n_aphorism[-1],
                    len(n_aphorism)
                )
                logger.error(error)
            if missing:
                error = 'Missing or problematic aphorism: {}'.format(missing)
                logger.error(error)
                warning = ('Last aphorism can be problematic but '
                           'not detected by the software.')
                logger.warning(warning)
            if doublon:
                error = 'Aphorism with same number: {}'.format(doublon)
                logger.error(error)
            if error:
                raise AphorismsToXMLException(error)
        except ValueError:
            error = ('Aphorism numeration format probably does not respect '
                     'the convention. '
                     'It should be a number following by a point')
            logger.error(error)
            raise AphorismsToXMLException
        except AphorismsToXMLException as e:
            raise AphorismsToXMLException(e)

        # create the dictionary with the aphorism (not sure that we need
        # the ordered one)
        # use n_aphorism to be sure that there are no error

        try:
            self._aph_com = {}
            for i, aph in enumerate(aphorism):
                self._aph_com[n_aphorism[i]] = [s.strip()
                                                for s in aph.split('\n')
                                                if len(s) != 0]
        except (IndexError, AphorismsToXMLException):
            error = ('Problem in the creation of the dictionary which'
                     'which contains the aphorisms')
            logger.error(error)
            raise AphorismsToXMLException

[docs]    def read_template(self):
        """Method to read the XML template used for the transformation

        Attributes
        ----------
        template : str
            Contain the text of the XML template provided.

        Raises
        ------
        AphorismsToXMLException
            if template cannot be found or read.
        """
        # Open the template file. Kill the process if not there.
        # Template is not optional.

        try:
            with open(self.template_fname, 'r', encoding="utf-8") as f:
                self.template = f.read()
                info = 'Template file {} found.'.format(self.template_fname)
                logger.info(info)
        except FileNotFoundError:
            error = 'Template file {} not found.'.format(self.template_fname)
            logger.error(error)
            raise AphorismsToXMLException

        if self.relaxng_fname is None:
            tree = etree.parse(self.template_fname)
            root = tree.getroot()
            model = root.xpath("/processing-instruction('xml-model')")[0]

            self.relaxng_fname = model.text.split('"')[1]

        logger.info('Relaxng file '
                    'use for validation: {} '.format(self.relaxng_fname))

    def _create_xml(self):
        """Fill the template with the XML produced so far.

        The template is read first if it is still empty. The placeholders
        ``#INSERTWITNESSES#``, ``#INSERTBODY#`` and ``#INSERTAPP#`` are
        replaced by the witnesses, the main XML lines (``self.xml``) and
        the apparatus lines (``self.app``); a placeholder is left untouched
        when there is nothing to insert for it.

        Attributes
        ----------
        xml : str
            The complete document; replaces the list of lines held before.
        """
        if self.template == '':
            self.read_template()

        xml = self.template

        # Plain str.replace is used on purpose: re.sub would interpret
        # backslashes and group references found in the inserted text.
        if self.wits:
            wits = sorted(set(self.wits))
            info = 'Witnesses found in the aphorisms and ' \
                   'commentaries {}'.format(wits)
            logger.info(info)
            indent = self.xml_oss * self.xml_n_offset
            _wits = [indent + '<witness> {} </witness>'.format(w)
                     for w in wits]
            xml = xml.replace('#INSERTWITNESSES#', '\n'.join(_wits))

        if self.xml:
            xml = xml.replace('#INSERTBODY#', '\n'.join(self.xml))
        if self.app:
            xml = xml.replace('#INSERTAPP#', '\n'.join(self.app))

        self.xml = xml
    def _validate_xml(self):
        """Validate the XML file written against the Relax NG schema.

        The schema is read from ``relaxng_fname``. When that name is unset
        or the file cannot be opened, ``RELAXNG_FNAME`` is used and stored
        in ``relaxng_fname``.

        Raises
        ------
        AphorismsToXMLException
            if the file ``xml_file`` is not valid for the schema.
        """
        relaxng_doc = None
        if self.relaxng_fname is not None:
            try:
                relaxng_doc = etree.parse(self.relaxng_fname)
            except OSError:
                relaxng_doc = None

        if relaxng_doc is None:
            # Declared schema unset or unreadable: use the default one.
            relaxng_doc = etree.parse(RELAXNG_FNAME)
            self.relaxng_fname = RELAXNG_FNAME

        relaxng = etree.RelaxNG(relaxng_doc)
        xml = etree.parse(self.xml_file)

        try:
            relaxng.assertValid(xml)
        except etree.DocumentInvalid as err:
            logger.error('The document {} created is '
                         'not valid corresponding '
                         'to the Relaxng declared '
                         'or used'.format(self.xml_file))
            # Keep the validator message for the caller.
            raise AphorismsToXMLException(str(err)) from err

        logger.info('The document {} created is '
                    'valid corresponding '
                    'to the Relaxng declared '
                    'or used'.format(self.xml_file))

[docs]    def treat_footnotes(self):
        """Method to treat Footnote.

        Work even if division of the document didn't work properly but
        for the footnotes part.
        """
        if not self.footnotes == '':
            # In most of the file the footnote will be present and can be
            # treated independently from the aphorism.

            # Treat the footnote part and create the XML app
            try:
                self.footnotes_app = Footnotes(self.footnotes)
            except FootnotesException:
                raise AphorismsToXMLException from None
            logger.info('Footnotes treated')

            # Create XML app
            self.footnotes_app.xml_app()
            self.app = self.footnotes_app.xml
            self.wits = self.footnotes_app.wits
            logger.info('Footnotes app file created')

[docs]    def main(self):
        """
        A function to process a text file containing symbols representing
        references to witnesses and symbols and footnotes defining textual
        variations, omissions, additions, correxi or conieci. This function
        uses these symbols to produce files containing EpiDoc compatible XML.

        If processing succeeds two XML files will be created in folder ./XML
        with file names that start with the text file base name and ending in
        _main.xml (for the main XML) and _apps.xml (for the apparatus XML).
        For example for file_1.txt the XML files will be file_1_main.xml and
        file_1_app.xml.

        Modify the attribute ``xml`` to add the title section in the main XML

        Raises
        ------
        AphorismsToXMLException
            if the processing of the file does not work as expected.

        """

        # Open and read the exegis document
        self.open_document()

        debug = 'Open document {}'.format(self.fname)
        logger.debug(debug)

        # Divide the document in the different part (intro, title,
        # text, footnotes)

        try:
            self.divide_document()
            logger.info('Division of the document ok.')
        except AphorismsToXMLException:
            logger.error('Division of the document failed.')
            raise AphorismsToXMLException

        self.treat_footnotes()

        self.aphorisms_dict()
        logger.info('Created aphorisms dictionary')

        if self._introduction != '':
            try:
                intro = Introduction(self._introduction, self._next_footnote)
                intro.xml_main()
                self._next_footnote = intro.next_footnote
                self.xml += intro.xml
                logger.debug('Introduction treated')
            except IntroductionException:
                raise AphorismsToXMLException from None

        # Deal with the first block of text which should contain
        # an optional intro
        # and the title
        # =======================================================

        try:
            title = Title(self._title, self._next_footnote, self.doc_num)
        except TitleException:
            raise AphorismsToXMLException from None
        logger.debug('Title treated')

        title.xml_main()
        logger.debug('Title xml created')

        self._next_footnote = title.next_footnote

        # Add title to the xml main
        self.xml += title.xml

        # Now process the rest of the main text
        # =====================================
        logger.debug('Start aphorisms and commentaries treatment')
        for k in self._aph_com:
            if not len(self._aph_com[k]):
                error = ('There are no aphorisms  in the file. '
                         'It can be because of the numeration. '
                         'Verify that the it is starting at 1 or 1. not .1 '
                         '(the point can be after the number but not before.')
                logger.error(error)
                raise AphorismsToXMLException

            aphorism = self._aph_com[k][0]
            commentaries = self._aph_com[k][1:]

            # Add initial XML for the aphorism + commentary unit
            self.xml.append(self.xml_oss * self.xml_n_offset + '<div n="' +
                            str(k) + '" type="aphorism_commentary_unit">')

            # Add initial XML for this aphorism
            self.xml.append(self.xml_oss * (self.xml_n_offset + 1) +
                            '<div type="aphorism">')
            self.xml.append(self.xml_oss * (self.xml_n_offset + 2) + '<p>')

            # Now process any witnesses in it. If this fails with an
            # Exception print an error and return
            try:
                line_ref = references(aphorism)
            except AnalysisException:
                error = ('Unable to process references in '
                         'aphorism {}'.format(k))
                logger.error(error)
                raise AphorismsToXMLException from None

            if line_ref is None or line_ref == '':
                continue

            # Process any footnotes in line_ref, if there are errors write
            # to the log file and return
            try:
                self.xml_n_offset += 3
                xml_main_to_add, self._next_footnote = \
                    footnotes(line_ref, self._next_footnote)
                self.xml_n_offset -= 3
            except (TypeError, AnalysisException):
                error = ('Unable to process footnotes in '
                         'aphorism {}'.format(k))
                logger.error(error)
                raise AphorismsToXMLException from None

            # Add the XML
            self.xml.extend(xml_main_to_add)

            # Close the XML for the aphorism
            self.xml.append(self.xml_oss * (self.xml_n_offset + 1) + '</p>')
            self.xml.append(self.xml_oss * self.xml_n_offset + '</div>')

            # Get the next line of text
            for n_com, line in enumerate(commentaries):

                # Workaround footnote on first word
                line = ' ' + line

                if line[-1] != '.':

                    debug = ('Commentaries should ended with a `.`\n'
                             'Warning in aphorism {}\n'
                             'commentary {}'.format(k, line))
                    logger.debug(debug)

                # Add initial XML for this aphorism's commentary
                self.xml.append(self.xml_oss * self.xml_n_offset +
                                '<div type="commentary">')
                self.xml.append(self.xml_oss * (self.xml_n_offset + 1) + '<p>')

                # Now process any witnesses in this line. If this fails with a
                # CommentaryToEpidocException and log an error
                try:
                    line_ref = references(line)
                except AnalysisException:
                    error = ('Unable to process references, '
                             'commentary {} for aphorism '
                             '{}'.format(n_com+1, k))
                    logger.error(error)
                    raise AphorismsToXMLException from None

                # Process any _footnotes in line_ref. If this fails with a
                # CommentaryToEpidocException and log an error
                try:
                    self.xml_n_offset += 3
                    xml_main_to_add, self._next_footnote = \
                        footnotes(line_ref, self._next_footnote)
                    self.xml_n_offset -= 3
                except (TypeError, AnalysisException):
                    error = ('Unable to process footnote, '
                             'commentary {} for aphorism '
                             '{}'.format(n_com+1, k))
                    logger.error(error)
                    raise AphorismsToXMLException from None

                # Add the XML
                self.xml.extend(xml_main_to_add)

                # Close the XML for this commentary
                self.xml.append(self.xml_oss * (self.xml_n_offset + 1) +
                                '</p>')
                self.xml.append(self.xml_oss * self.xml_n_offset +
                                '</div>')

            # Close the XML for the aphorism + commentary unit
            self.xml.append(self.xml_oss * self.xml_n_offset + '</div>')

        logger.debug('Finish aphorisms and commentaries treatment')
        # Save the xmls created

        self._create_xml()
        self.save_xml(self.xml_file)
        self._validate_xml()
        logger.debug('Save main xml')