Source code for reach_tools.utils

import re
from importlib.util import find_spec
import logging
from html.parser import HTMLParser
from pathlib import Path
from typing import Union

from html2text import html2text

from . import reference
from .logging_utils import configure_logging, format_pandas_for_logging

# Public API of this module: the names exported by ``from reach_tools.utils import *``.
# NOTE(review): ``build_data_resources`` is defined in this module and ``format_pandas_for_logging``
# is imported into it, but neither is listed here - confirm that leaving them out is intentional.
__all__ = [
    "reference",
    "build_data_directory",
    "configure_logging",
    "strip_html_tags",
    "cleanup_string",
]


# helper for cleaning up HTML strings
# From - https://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
class _MLStripper(HTMLParser):
    """HTML parser that throws away markup and keeps only the text between tags."""

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no second reset is needed; character
        # references are converted so that, for instance, "&amp;" is collected as "&"
        super().__init__(convert_charrefs=True)

        # text fragments in the order the parser reports them
        self.fed = []

    def handle_data(self, d):
        """Collect a run of text found outside of any tag."""
        self.fed.append(d)

    def get_data(self) -> str:
        """Return everything collected so far joined into one string."""
        return "".join(self.fed)


def strip_html_tags(html: str) -> str:
    """
    Remove HTML tags from a string.

    Args:
        html: HTML string to be cleaned.

    Returns:
        String with HTML tags removed and character references converted to text.
    """
    s = _MLStripper()
    s.feed(html)

    # when converting character references, the parser holds back trailing text containing an
    # unterminated "&" (for instance "AT&T") in case more input is coming; close() tells it the
    # input is complete so this text is not silently dropped
    s.close()

    return s.get_data()




[docs]
def build_data_directory(dir_path: Union[str, Path]) -> Path:
    """
    Create a directory in the specified path.

    .. note::
        If the parents for the directory path do not exist, they will automatically be created.

    Args:
        dir_path: Path where directory shall be created. A string or any other path-like object is
            converted to a ``Path``.

    Returns:
        Path to directory location.
    """
    # make sure working with a Path, whether handed a string or another path-like object
    if not isinstance(dir_path, Path):
        dir_path = Path(dir_path)

    # if already exists, leave it alone
    if dir_path.exists():
        logging.debug(f'Directory already exists, so not recreating, "{dir_path}"')

    # if does not exist, create it
    else:
        # exist_ok guards against the directory being created by something else between the
        # existence check above and this call
        dir_path.mkdir(parents=True, exist_ok=True)
        logging.info(f'Created directory at "{dir_path}"')

    return dir_path



def build_data_resources(data_dir: Union[str, Path]) -> Path:
    """
    Lay out the standard project data directory tree.

    The data directory itself is built first, followed by the ``external``, ``raw``, ``interim`` and
    ``processed`` directories directly beneath it.

    Args:
        data_dir: Location of the data directory, as a string or ``Path``.

    Returns:
        Path to the data directory.
    """
    # strings are promoted to Path so the child locations can be composed with "/"
    root = Path(data_dir) if isinstance(data_dir, str) else data_dir

    # parent first, then one directory for each kind of data
    build_data_directory(root)
    for child in ("external", "raw", "interim", "processed"):
        build_data_directory(root / child)

    return root



[docs]
def cleanup_string(input_string: str) -> str:
    """
    Helper function to clean up description strings.

    The input is converted to Markdown with ``html2text`` and the whitespace is then tidied up: blank
    lines are kept as paragraph breaks, single newlines are replaced with a space, repeated spaces
    and tabs are collapsed to one space, and leading and trailing whitespace is removed. Any angle
    brackets left in the text are swapped for square brackets.

    Args:
        input_string: String, potentially containing HTML, to be cleaned up.

    Returns:
        Cleaned up string. Empty input (``""`` or ``None``) is returned unchanged.
    """
    # ensure something to work with
    if not input_string:
        return input_string

    # convert to markdown first, so any reasonable formatting is retained
    cleanup = html2text(input_string)

    # drop whitespace dangling at the end of lines, so runs of newlines are contiguous
    cleanup = re.sub(r"[^\S\n]+\n", "\n", cleanup)

    # apparently some people think it is a good idea to hit return more than twice...account for this foolishness
    cleanup = re.sub(r"\n{3,}", "\n\n", cleanup)

    # a single newline between text becomes a space; lookarounds are used so the neighboring characters
    # are not consumed, otherwise every other newline in consecutive one-character lines is missed
    cleanup = re.sub(r"(?<=[^\n])\n(?=[^\n])", " ", cleanup)

    # since people love to hit the space key multiple times in stupid places, get rid of multiple space, but leave
    # newlines in there since they actually do contribute to formatting - hence not simply matching on \s
    cleanup = re.sub(r"[^\S\n]{2,}", " ", cleanup)

    # correct any leftover standalone links
    cleanup = cleanup.replace("<", "[").replace(">", "]")

    # get rid of any leading or trailing whitespace, including newlines at the end of the text block
    cleanup = cleanup.strip()

    # finally call it good
    return cleanup