""" Module from http://github.com/ponteineptique/archives_org_latin_toolkit
"""
from pandas import read_csv
import os
import re
import multiprocessing
import math
# Matches an optionally negative integer with an optional " BCE" suffix,
# e.g. "476", "-44" or "44 BCE". Group 1 is the whole match, group 2 the
# suffix.  A raw string keeps "\d" a regex escape instead of an invalid
# string escape.
__numb__ = re.compile(r"([-]?\d+( BCE)?)")
def bce(x):
    """ Format a BCE string as a negative numeral

    :param x: Value to parse, e.g. ``"44 BCE"``
    :type x: str
    :return: The value with the ``BCE`` marker removed and a single leading \
    minus sign added; values without ``BCE`` are returned unchanged
    :rtype: str
    """
    if "BCE" not in x:
        return x
    # Strip the marker together with any whitespace in front of it, so that
    # both "44 BCE" and "44BCE" are handled.
    without_marker = re.sub(r"\s*BCE", "", x)
    # An already negative value ("-44 BCE") must not end up with two signs.
    return ("-" + without_marker).replace("--", "-")
def period(x):
    """ Parse a period in metadata. If there are multiple dates, returns the mean

    Every number matched by ``__numb__`` is normalised with :func:`bce` and
    converted to ``int``; the mean of these values is rounded up.

    :param x: Value to parse
    :type x: str
    :return: Parsed numeral (mean of all dates found, rounded up)
    :rtype: int
    :raises ZeroDivisionError: If ``x`` contains no number at all
    """
    # findall yields (whole match, " BCE" group) tuples; only the first is used.
    dates = [int(bce(number)) for number, _ in __numb__.findall(x)]
    return math.ceil(sum(dates) / len(dates))
class Text:
    """ Text reading object for archive_org

    The file is read lazily on first access to :attr:`raw`; the raw and
    cleaned contents are cached until :meth:`cleanUp` is called.

    :param file: File path
    :type file: str
    :param metadata: Metadata registry
    :type metadata: Metadata
    :param lowercase: Clean Text will be in lowercase
    :type lowercase: bool

    :ivar name: Name of the file
    :type name: str
    :ivar composed: Date of composition
    :type composed: int
    """
    # Raw strings keep "\w" / "\s" regex escapes instead of invalid string escapes.
    __entities = re.compile(r"&\w+;")
    __punct = re.compile(r"[^a-zA-Z]+")
    __space = re.compile(r"[\s]+")

    def __init__(self, file, metadata=None, lowercase=False):
        self.__file__ = file
        self.__date__ = None
        self.__raw__ = None
        self.__clean__ = None
        self.__lower__ = lowercase
        self.__metadata__ = metadata

    @property
    def name(self):
        """ Last component of the file path """
        # basename understands the separators of the running platform,
        # unlike a hard-coded split on "/".
        return os.path.basename(self.__file__)

    @property
    def composed(self):
        """ Date returned by ``metadata.getDate(file)``; None without metadata """
        if self.__metadata__:
            # "is None" so that a date of 0 is cached like any other value.
            if self.__date__ is None:
                self.__date__ = self.__metadata__.getDate(self.__file__)
        return self.__date__

    @property
    def raw(self):
        """ Unmodified content of the file, read on first access """
        # "is None" so that an empty file is not re-read on every access.
        if self.__raw__ is None:
            with open(self.__file__) as f:
                self.__raw__ = f.read()
        return self.__raw__

    @property
    def clean(self):
        """ Clean version of the text : normalized space, remove new line, dehyphenize, remove punctuation and number.
        """
        if self.__clean__ is None:
            # Join words hyphenated across a line break, then flatten lines.
            text = self.raw.replace("-\n", "").replace("\n", " ")
            # Drop entities such as "&amp;" before the non-letter pass,
            # otherwise their names would survive as words.
            text = self.__entities.sub(" ", text)
            text = self.__punct.sub(" ", text)
            text = self.__space.sub(" ", text)
            if self.__lower__:
                text = text.lower()
            self.__clean__ = text
        return self.__clean__

    def cleanUp(self):
        """ Clean textual information and free RAM
        """
        self.__raw__ = None
        self.__clean__ = None

    def has_strings(self, *strings):
        """ Check if at least one of the given strings is in the raw file content

        :param strings: Strings as multiple arguments
        :return: True as soon as one string is found, False otherwise \
        (including when no string is given)
        :rtype: bool
        """
        return any(string in self.raw for string in strings)

    def find_embedding(self, *strings, window=50, ignore_center=False, memory_efficient=True):
        """ Yield the context of every occurrence of the given words in the clean text

        :param strings: Words to look for, as multiple arguments
        :param window: Number of words to retrieve on each side of the match
        :param ignore_center: Remove the word found from the embedding \
        (every occurrence of it inside the window)
        :param memory_efficient: Call :meth:`cleanUp` once the search is over
        :return: Generator of lists of words
        """
        words = self.clean.split()
        # Membership is tested once per word of the text: use a set.
        targets = frozenset(strings)
        try:
            for i, word in enumerate(words):
                if word not in targets:
                    continue
                context = __window__(words, window, i)
                if ignore_center:
                    context = [w for w in context if w != word]
                yield context
        finally:
            # Also runs when the consumer abandons the generator early.
            if memory_efficient:
                self.cleanUp()
class Repo:
    """ Repo reading object for archive_org

    :param directory: Directory walked recursively; one Text is built per file
    :type directory: str
    :param metadata: Metadata registry
    :type metadata: Metadata
    :param lowercase: Clean Text will be in lowercase
    :type lowercase: bool
    """
    def __init__(self, directory, metadata=None, lowercase=False):
        self.__directory__ = directory
        self.__metadata__ = metadata
        # Keyed by bare file name: two files with the same name in different
        # sub-directories share a key and the last one walked wins.
        self.__files__ = {
            file: Text(os.path.join(root, file), metadata, lowercase=lowercase)
            for root, _dirs, files in os.walk(directory)
            for file in files
        }

    def get(self, identifier):
        """ Get the Text object given its identifier

        :param identifier: Filename or identifier
        :type identifier: str
        :return: Text object
        :rtype: Text
        :raises KeyError: If no file with this name was found in the directory
        """
        return self.__files__[identifier]

    def find(self, *strings, multiprocess=None, memory_efficient=True):
        """ Find files who contains given strings

        :param strings: Strings as multiple arguments
        :param multiprocess: Number of process to spawn
        :type multiprocess: int
        :param memory_efficient: Drop the content of files to avoid filling the ram with unused content \
        (forwarded to the worker function when multiprocessing)
        :type memory_efficient: bool
        :return: Files who are matching the strings
        :rtype: generator
        :raises ValueError: If ``multiprocess`` is an integer lower than 1
        """
        if isinstance(multiprocess, int):
            if multiprocess < 1:
                raise ValueError("multiprocess must be a positive number of processes")
            files = list(self.__files__.values())
            if not files:
                return
            chunksize = int(math.ceil(len(files) / float(multiprocess)))
            # One job per non-empty slice; there are at most `multiprocess` of them.
            jobs = [
                (strings, files[start:start + chunksize], memory_efficient)
                for start in range(0, len(files), chunksize)
            ]
            # The context manager shuts the workers down even if the consumer
            # stops iterating early.  Results come back from other processes,
            # so they are copies of the Text objects held by this Repo.
            with multiprocessing.Pool(multiprocess) as pool:
                for result in pool.imap_unordered(__find_multiprocess__, jobs):
                    for element in result:
                        yield element
        else:
            for file in self.__files__.values():
                if file.has_strings(*strings):
                    yield file
                # Runs after the consumer is done with a yielded file.
                if memory_efficient:
                    file.cleanUp()
def __find_multiprocess__(args):
    """ Find files who contains given strings

    Takes a single tuple so that it can be used with ``Pool.imap_unordered``.

    :param args: Tuple ``(strings, files, memoryefficient)`` : strings to look for, \
    list of file objects to scan, and whether to call ``cleanUp`` on each scanned file
    :return: Files who are matching the strings
    :rtype: list
    """
    strings, files, memoryefficient = args
    results = []
    # Popping consumes the given list from its end, so the reference to each
    # file is dropped from the list as soon as it has been scanned.
    while files:
        file = files.pop()
        if file.has_strings(*strings):
            results.append(file)
        if memoryefficient:
            file.cleanUp()
    return results
def __window__(array, window, i):
    """ Compute embedding using i

    :param array: List of words
    :param window: Number of word to take left, then right [ len(result) = (2*window)+1 \
    unless the window is cut by the start or the end of the list ]
    :param i: Index of the word
    :return: List of words
    """
    # Without the clamp a negative start would count from the end of the list.
    start = max(i - window, 0)
    # The stop index needs no clamp: slicing stops at the end of the list.
    return array[start:i + window + 1]