import re
from importlib.util import find_spec
import logging
from html.parser import HTMLParser
from pathlib import Path
from typing import Union

from html2text import html2text

from . import reference
from .logging_utils import configure_logging, format_pandas_for_logging

__all__ = [
    "reference",
    "build_data_directory",
    "configure_logging",
    "strip_html_tags",
    "cleanup_string",
]


# helper for cleaning up HTML strings
# From - https://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
class _MLStripper(HTMLParser):
    """HTML parser that records only the text content it is handed, discarding the markup."""

    def __init__(self):
        super().__init__()

        # accumulator for every text fragment passed to handle_data
        self.fed = []

        # parser configuration, set explicitly rather than relying on defaults
        self.convert_charrefs = True
        self.strict = False

        # start from a clean parser state
        self.reset()

    def handle_data(self, d):
        """Store a fragment of text found between tags."""
        self.fed.append(d)

    def get_data(self) -> str:
        """Return all text collected so far as a single string."""
        collected = "".join(self.fed)
        return collected
def strip_html_tags(html: str) -> str:
    """
    Remove HTML tags from a string.

    Args:
        html: HTML string to be cleaned.

    Returns:
        String with HTML tags removed.
    """
    s = _MLStripper()
    s.feed(html)

    # feed() may hold back trailing text while it waits for more input (for instance text ending in an
    # unterminated "&" such as "AT&T"); close() forces the parser to process everything still buffered,
    # so no text is silently dropped
    s.close()

    return s.get_data()
def build_data_directory(dir_path: Union[str, Path]) -> Path:
    """
    Create a directory in the specified path.

    .. note::

        If the parents for the directory path do not exist, they will automatically be created.

    Args:
        dir_path: Path where directory shall be created.

    Returns:
        Path to directory location.
    """
    # make sure working with a Path; anything that is not already a Path (a string or other path-like
    # object) is converted, while an existing Path instance is passed through untouched
    if not isinstance(dir_path, Path):
        dir_path = Path(dir_path)

    # if already exists, leave it alone
    if dir_path.exists():
        logging.debug(f'Directory already exists, so not recreating, "{dir_path}"')

    # if does not exist, create it
    else:
        # exist_ok covers the window between the exists() check above and this call - if something else
        # creates the directory in the meantime, mkdir would otherwise raise FileExistsError
        dir_path.mkdir(parents=True, exist_ok=True)
        logging.info(f'Created directory at "{dir_path}"')

    return dir_path
def build_data_resources(data_dir: Union[str, Path]) -> Path:
    """
    Build out standard data directory structure in location where data shall reside for the project.

    The parent directory is built first, followed by one sub-directory for each type of data:
    ``external``, ``raw``, ``interim`` and ``processed``.

    Args:
        data_dir: Path to where data directory resides.

    Returns:
        Path to data directory.
    """
    # normalise string input to a Path so the "/" operator can be used below
    root = Path(data_dir) if isinstance(data_dir, str) else data_dir

    # parent data directory first
    build_data_directory(root)

    # then the four directories for the different types of data
    for stage in ("external", "raw", "interim", "processed"):
        build_data_directory(root / stage)

    return root
def cleanup_string(input_string: str) -> str:
    """
    Helper function to clean up description strings.

    The input is first passed through ``html2text`` and the result then has its whitespace normalized:
    blank-line paragraph breaks are kept (capped at one blank line), single newlines are joined into
    spaces, runs of spaces are collapsed, leftover angle brackets are swapped for square brackets, and
    leading and trailing whitespace is removed.

    Args:
        input_string: Description text to clean up.

    Returns:
        Cleaned up string. Empty (or ``None``) input is returned unchanged.
    """
    # ensure something to work with
    if not input_string:
        return input_string

    # convert to markdown first, so any reasonable formatting is retained
    cleanup = html2text(input_string)

    # more than two consecutive newlines add nothing, so cap paragraph breaks at a single blank line
    cleanup = re.sub(r"\n{3,}", "\n\n", cleanup)

    # a lone newline sitting between two non-newline characters becomes a space; lookarounds are used
    # so the neighbouring characters are not consumed, otherwise every other newline in a sequence of
    # very short lines would be skipped
    cleanup = re.sub(r"(?<=[^\n])\n(?=[^\n])", " ", cleanup)

    # collapse runs of spaces and tabs into a single space; newlines are deliberately excluded from the
    # character class because paragraph breaks contribute to formatting and must survive this step
    cleanup = re.sub(r"[^\S\n]{2,}", " ", cleanup)

    # correct any leftover standalone links
    cleanup = cleanup.replace("<", "[").replace(">", "]")

    # get rid of any leading or trailing whitespace, which also covers trailing newlines at the end of
    # the entire text block
    cleanup = cleanup.strip()

    # finally call it good
    return cleanup