Source code for muspy.datasets.hymnal

"""Hymnal Dataset."""
from pathlib import Path
from typing import Optional, Union

import requests

from ..inputs import read_midi
from ..music import Music
from .base import DatasetInfo, FolderDataset

_NAME = "Hymnal Dataset"
_DESCRIPTION = """\
The Hymnal Dataset is a collection of hymns in MIDI format available at
hymnal.net."""
_HOMEPAGE = "https://www.hymnal.net/"


[docs]class HymnalDataset(FolderDataset):
    """Hymnal Dataset."""

    _info = DatasetInfo(_NAME, _DESCRIPTION, _HOMEPAGE)
    _extension = "mid"
    _type = "mid"

    def __init__(
        self,
        root: Union[str, Path],
        download: bool = False,
        convert: bool = False,
        kind: str = "json",
        n_jobs: int = 1,
        ignore_exceptions: bool = True,
        use_converted: Optional[bool] = None,
    ):

        self.root = Path(root).expanduser().resolve()
        if not self.root.exists():
            raise ValueError("`root` must be an existing path.")
        if not self.root.is_dir():
            raise ValueError("`root` must be a directory.")

        if download:
            self.download()

        super().__init__(
            root, convert, kind, n_jobs, ignore_exceptions, use_converted
        )

[docs]    def read(self, filename: Union[str, Path]) -> Music:
        """Read a file into a Music object."""
        return read_midi(self.root / filename)

[docs]    def download(self) -> "FolderDataset":
        """Download the source datasets.

        Returns
        -------
        Object itself.

        """
        # Maximum consecutive trials allowed to fail
        tolerance = 10

        kinds = ["Classic", "New Tunes", "New Songs", "Children"]
        keys = ["h", "nt", "ns", "c"]

        print("Downloading sources.")
        for kind, key in zip(kinds, keys):
            # Make sure the folder exists
            (self.root / kind).mkdir(exist_ok=True)

            # Reset the index and the consecutive failure counter
            idx = 1
            consecutive_failure_count = 0

            # Loop until the number of consecutive failures exceed tolerance
            while consecutive_failure_count < tolerance:
                # Send a HEAD request to check if the content type is MIDI
                url = "https://www.hymnal.net/en/hymn/{}/{}/f={}".format(
                    key, idx, self._type
                )
                req = requests.head(url)
                if req.headers["Content-Type"] != "audio/midi":
                    consecutive_failure_count += 1
                    continue

                # Send another HEAD request to check if we have exceeded the
                # total number of pieces -> When we request for an out of
                # bound index, it seems that it will randomly return another
                # piece. Thus, if the first and the second requests have
                # different content sizes, we can break the loop.
                second_req = requests.head(url)
                if (
                    second_req.headers["Content-Length"]
                    != req.headers["Content-Length"]
                ):
                    break

                # Send a GET request to get the MIDI file
                req = requests.get(url)
                filename = str(self.root / kind / (str(idx) + ".mid"))
                with open(filename, "wb") as f:
                    f.write(req.content)

                # Reset the consecutive failure counter
                if consecutive_failure_count:
                    consecutive_failure_count = 0

                idx += 1

                if idx % 100 == 0:
                    print("Successfully downloaded {} files.".format(idx))

        (self.root / ".muspy.success").touch(exist_ok=True)
        return self


[docs]class HymnalTuneDataset(FolderDataset):
    """Hymnal Dataset (tune only)."""

    _info = DatasetInfo(_NAME, _DESCRIPTION, _HOMEPAGE)
    _extension = "mid"
    _type = "tune"

    def __init__(
        self,
        root: Union[str, Path],
        download: bool = False,
        convert: bool = False,
        kind: str = "json",
        n_jobs: int = 1,
        ignore_exceptions: bool = True,
        use_converted: Optional[bool] = None,
    ):

        self.root = Path(root).expanduser().resolve()
        if not self.root.exists():
            raise ValueError("`root` must be an existing path.")
        if not self.root.is_dir():
            raise ValueError("`root` must be a directory.")

        if download:
            self.download()

        super().__init__(
            root, convert, kind, n_jobs, ignore_exceptions, use_converted
        )

[docs]    def read(self, filename: Union[str, Path]) -> Music:
        """Read a file into a Music object."""
        return read_midi(self.root / filename)

[docs]    def download(self) -> "FolderDataset":
        """Download the source datasets.

        Returns
        -------
        Object itself.

        """
        # Maximum consecutive trials allowed to fail
        tolerance = 10

        kinds = ["Classic", "New Tunes", "New Songs", "Children"]
        keys = ["h", "nt", "ns", "c"]

        print("Downloading sources.")
        for kind, key in zip(kinds, keys):
            # Make sure the folder exists
            (self.root / kind).mkdir(exist_ok=True)

            # Reset the index and the consecutive failure counter
            idx = 1
            consecutive_failure_count = 0

            # Loop until the number of consecutive failures exceed tolerance
            while consecutive_failure_count < tolerance:
                # Send a HEAD request to check if the content type is MIDI
                url = "https://www.hymnal.net/en/hymn/{}/{}/f={}".format(
                    key, idx, self._type
                )
                req = requests.head(url)
                if req.headers["Content-Type"] != "audio/midi":
                    consecutive_failure_count += 1
                    continue

                # Send another HEAD request to check if we have exceeded the
                # total number of pieces -> When we request for an out of
                # bound index, it seems that it will randomly return another
                # piece. Thus, if the first and the second requests have
                # different content sizes, we can break the loop.
                second_req = requests.head(url)
                if (
                    second_req.headers["Content-Length"]
                    != req.headers["Content-Length"]
                ):
                    break

                # Send a GET request to get the MIDI file
                req = requests.get(url)
                filename = str(self.root / kind / (str(idx) + ".mid"))
                with open(filename, "wb") as f:
                    f.write(req.content)

                # Reset the consecutive failure counter
                if consecutive_failure_count:
                    consecutive_failure_count = 0

                idx += 1

                if idx % 100 == 0:
                    print("Successfully downloaded {} files.".format(idx))

        (self.root / ".muspy.success").touch(exist_ok=True)
        return self