Source code for owmeta_movement.zenodo

from contextlib import contextmanager
import logging
from os import makedirs
from os.path import join as p, isfile
import re
import shutil

from bs4 import BeautifulSoup
from owmeta.document import BaseDocument
from owmeta_core.data_trans.local_file_ds import LocalFileDataSource
from owmeta_core.context import ClassContext
from owmeta_core.dataobject import DatatypeProperty
from owmeta_core.datasource_loader import DataSourceDirLoader, LoadFailed
from owmeta_core.datasource import Informational
import requests

from . import CONTEXT as MOVEMENT_CONTEXT

# Module-level logger, named after this module
L = logging.getLogger(__name__)

# Identifier of the schema context declared by this module
SCHEMA_URL = 'http://schema.openworm.org/2020/07/sci/bio/movement/zenodo'

# Context assigned as `class_context` on the classes in this module. It imports
# the movement context of the enclosing package and uses the schema URL,
# followed by '#', as its base namespace
CONTEXT = ClassContext(
    ident=SCHEMA_URL,
    imported=(MOVEMENT_CONTEXT,),
    base_namespace=f'{SCHEMA_URL}#',
)


# Base URL used when a data source does not provide its own `zenodo_base_url`
_ZENODO_BASE_URL = 'https://zenodo.org'


class ZenodoRecordDirLoader(DataSourceDirLoader):
    '''
    Provides files by downloading them from Zenodo.
    '''

    def __init__(self, base_directory=None, session_provider=None, **kwargs):
        '''
        Parameters
        ----------
        base_directory : str, optional
            Path to a directory where files will be saved when requested. An attempt
            will be made to create the directory if it does not already exist. The
            files created in this directory may be reused by other instances *of the
            same version* of this class.
        session_provider : callable, optional
            Should return a `requests.Session` for the sake of making requests to
            Zenodo. By default, will use a new session for every request
        '''
        super().__init__(base_directory=base_directory, **kwargs)
        if session_provider is None:
            # A lambda (rather than `requests.Session` itself) so that the name is
            # looked up each time a session is requested
            session_provider = lambda: requests.Session()
        self._session_provider = session_provider

    @staticmethod
    def _resolve_base_url(ob):
        '''
        Return the Zenodo base URL to use for `ob`.

        `zenodo_base_url` is entirely optional: when the attribute is absent or its
        value is empty, the well-known Zenodo URL is used.
        '''
        base_url_prop = getattr(ob, 'zenodo_base_url', None)
        base_url = base_url_prop() if base_url_prop is not None else None
        return base_url or _ZENODO_BASE_URL

    def can_load(self, ob):
        '''
        Check whether the Zenodo record (or the single file) named by `ob` is
        reachable.

        Parameters
        ----------
        ob : object
            The data source. Must have a `zenodo_id` property and may have
            `zenodo_file_name` and `zenodo_base_url` properties

        Returns
        -------
        bool
            `True` if a HEAD request for the record (or file) URL answers with status
            200. `False` if a required value is missing or invalid, if the request
            fails, or if any other status is returned
        '''
        try:
            zenodo_id = ob.zenodo_id()
        except AttributeError:
            L.debug('Missing zenodo_id property for %s. Cannot download any files', ob)
            return False

        try:
            file_name = ob.zenodo_file_name()
        except AttributeError:
            file_name = None
            L.debug('Missing zenodo_file_name for %s. Will download all files in the'
                    ' record...', ob)

        if not zenodo_id:
            L.debug('zenodo_id value is invalid: %s', zenodo_id)
            return False

        if file_name is not None and not file_name:
            L.debug('zenodo_file_name value is invalid: %s', file_name)
            return False

        # Check the zenodo file is reachable by trying to grab the HEAD response
        # for it
        base_url = self._resolve_base_url(ob)
        if not file_name:
            url = _record_url(base_url, zenodo_id)
        else:
            url = _file_url(base_url, zenodo_id, file_name)

        session = self._session_provider()
        try:
            response = session.head(url)
        except requests.RequestException as e:
            # This method is a predicate: an unreachable server means "cannot
            # load" rather than an error for the caller to deal with
            L.warning('Could not reach %s to check for availability: %s', url, e)
            return False
        return response.status_code == 200

    def load(self, data_source):
        '''
        Download the files of the Zenodo record for `data_source`.

        Files are stored in a sub-directory of the base directory named after the
        record ID. Files already present in that directory are not downloaded again.

        Parameters
        ----------
        data_source : object
            The data source. Must have a `zenodo_id` property. If it has a non-empty
            `zenodo_file_name`, only that file is downloaded; otherwise, all files
            listed for the record are downloaded

        Returns
        -------
        str
            Path to the directory containing the downloaded files

        Raises
        ------
        LoadFailed
            If `zenodo_id` is missing or empty, if no files could be found for the
            record, or if a file request does not answer with status 200
        '''
        try:
            raw_zenodo_id = data_source.zenodo_id()
        except AttributeError:
            raise LoadFailed(data_source, self, 'Missing zenodo_id property')

        if not raw_zenodo_id:
            # Without this check an unset ID would be turned into the string
            # 'None' and used as a directory name and record ID
            raise LoadFailed(data_source, self, 'Missing zenodo_id value')
        zenodo_id = str(raw_zenodo_id)

        try:
            file_name = data_source.zenodo_file_name()
        except AttributeError:
            file_name = None
            L.debug('Missing zenodo_file_name for %s. Will download all files in the'
                    ' record...', data_source)

        recorddir = p(self.base_directory, zenodo_id)
        makedirs(recorddir, exist_ok=True)

        zenodo_base_url = self._resolve_base_url(data_source)

        # May re-evaluate this for resilience -- as it is, a failure part-way
        # through means the remaining files have to be fetched by a later call
        if file_name:
            files = [file_name]
        else:
            # The file list is scraped from the record page rather than obtained
            # from the Zenodo API
            session = self._session_provider()
            files = list(list_record_files(zenodo_id,
                                           zenodo_base_url=zenodo_base_url,
                                           session=session))
            if not files:
                raise LoadFailed(data_source, self, 'Could not find any files')

        for record_file_name in files:
            dest_file_name = p(recorddir, record_file_name)
            if isfile(dest_file_name):
                # TODO: check the hash of the file is the one we expect
                continue
            with self._download_from_zenodo(zenodo_id,
                                            record_file_name,
                                            zenodo_base_url) as response:
                if response.status_code != 200:
                    raise LoadFailed(data_source, self,
                                     f'Missing file {record_file_name}')
                # Zenodo seems to assign a distinct record ID for each version of
                # a record, so we shouldn't have to worry about conflicts here
                self._save_response(response, dest_file_name)
        return recorddir

    @staticmethod
    def _save_response(response, dest_file_name):
        '''
        Write the raw body of `response` to `dest_file_name`.

        The body is first written to a uniquely-named temporary file in the same
        directory and only renamed to `dest_file_name` once it has been written
        completely. An interrupted download therefore never leaves a truncated file
        under the final name, where it would be mistaken for a finished download.
        '''
        # Imported here because the module only imports selected names from `os`
        from os import remove, replace
        from uuid import uuid4

        part_file_name = f'{dest_file_name}.{uuid4().hex}.part'
        try:
            with open(part_file_name, 'xb') as part_file:
                shutil.copyfileobj(response.raw, part_file)
            replace(part_file_name, dest_file_name)
        except BaseException:
            # Clean up on any failure, including interruption, then re-raise
            try:
                remove(part_file_name)
            except OSError:
                pass
            raise

    @contextmanager
    def _download_from_zenodo(self, zenodo_id, file_name, base_url):
        '''
        Download a file from zenodo. The response should be used in a context manager
        to ensure it is closed properly.

        Parameters
        ----------
        zenodo_id : str
            The Zenodo record ID
        file_name : str
            The name of the file, within the record, to download
        base_url : str
            The base zenodo URL

        Yields
        ------
        requests.Response
            The streaming response for the file request
        '''
        file_url = _file_url(base_url, zenodo_id, file_name)
        session = self._session_provider()
        with session.get(file_url, stream=True) as response:
            yield response
def list_record_files(zenodo_id, session=None, zenodo_base_url=None):
    '''
    List files in a Zenodo record

    The names are scraped from the download links on the record's page.

    Parameters
    ----------
    zenodo_id : int
        Zenodo record ID for which files should be listed
    session : requests.Session, optional
        The session to use for requests to Zenodo. Creates a default
        `requests.Session` if not provided; that session is closed once the record
        page has been read.
    zenodo_base_url : str, optional
        The base URL for zenodo. Uses the common Zenodo URL if not provided

    Yields
    ------
    str
        File names of records, as they appear in the download links
    '''
    if zenodo_base_url is None:
        zenodo_base_url = _ZENODO_BASE_URL

    owns_session = session is None
    if owns_session:
        session = requests.Session()

    try:
        with session.get(_record_url(zenodo_base_url, zenodo_id),
                         stream=True) as response:
            page = response.content
    finally:
        # Only close what was created here: a caller's session is the caller's
        # to manage
        if owns_session:
            session.close()

    soup = BeautifulSoup(page, 'html.parser')
    re_safe_id = re.escape(str(zenodo_id))
    file_ref_re = re.compile(rf'/record/{re_safe_id}/files/(.*)\?download=1')

    for elem in soup.find_all(class_='filename', href=file_ref_re):
        # BeautifulSoup applies a regular expression filter with `search`, so
        # `search` (not `match`) must be used here as well. Otherwise links whose
        # href does not *start* with the pattern, such as absolute URLs, would
        # pass the filter above and then be dropped
        md = file_ref_re.search(elem['href'])
        if md:
            yield md.group(1)
        else:
            L.warning('Regular expression does not match twice?? I guess BeautifulSoup4 is broken.')
def _record_url(base_url, zenodo_id): return f'{base_url}/record/{zenodo_id}' def _file_url(base_url, zenodo_id, file_name): return f'{base_url}/record/{zenodo_id}/files/{file_name}?download=1'
class ZenodoFileDataSource(LocalFileDataSource):
    '''
    A `LocalFileDataSource` that gets its data from Zenodo.

    Mostly these come from the OpenWorm Movement Database community:

        https://zenodo.org/communities/open-worm-movement-database/?page=1&size=20

    Different zenodo entries in this community package their data differently, so
    sub-classes should handle the details.
    '''

    class_context = CONTEXT

    zenodo_base_url = Informational(
        description=('Base Zenodo URL. Should use the well-known site URL if this'
                     ' property is unavailable'),
        multiple=False,
    )

    zenodo_id = Informational(
        description='Record ID from Zenodo',
        multiple=False,
    )

    # Declared with `also=` referring to the `file_name` property of
    # `LocalFileDataSource`
    zenodo_file_name = Informational(
        description='Name of a file in a Zenodo record in `zenodo_id`',
        multiple=False,
        also=LocalFileDataSource.file_name,
    )
class ZenodoRecord(BaseDocument):
    '''
    Represents a Zenodo record.

    `zenodo_id` is declared as the key property of the record.
    '''

    class_context = CONTEXT

    zenodo_id = DatatypeProperty(multiple=False,
                                 __doc__='Record ID from Zenodo')

    key_property = zenodo_id