Commit bd9d2707 authored by Sébastien Tourbier

feat: add dataset.publish and dataset.clone commands and corresponding functions

parent de489e24
datahipy/bids/dataset.py +99 −0
@@ -299,3 +299,102 @@ def get_all_datasets_content(
        with open(output_file, "w") as f:
            json.dump(datasets_desc, f, indent=4)
    print(SUCCESS)


def dataset_publish(input_data, output_file):
    """Publish a dataset to the public space of the HIP.

    Parameters
    ----------
    input_data : str
        Path to the input_data JSON file in the following format::

            {
                "sourceDatasetPath": "/path/to/private/or/collab/dataset",
                "targetDatasetPath": "/path/of/dataset/to/be/published/to/public/space",
            }

    output_file : str
        Path to the output published dataset summary in JSON format
        to be indexed by the Data Search Engine of the HIP.
    """
    # Load the HIP JSON request into a dict
    with open(input_data, "r") as f:
        input_content = json.load(f)
    # Extract the source and target dataset paths
    source_dataset_path = input_content["sourceDatasetPath"]
    target_dataset_path = input_content["targetDatasetPath"]
    # Create datalad dataset sibling to publish to
    datalad.api.create_sibling(
        name="public",
        dataset=source_dataset_path,
        sshurl=target_dataset_path,
        # Uncomment once the public space supports HTTPS access,
        # since as_common_datasrc expects the sshurl protocol
        # to be http or https.
        # as_common_datasrc=True,
        recursive=True
    )
    # Publish the dataset to the public space
    datalad.api.push(
        dataset=source_dataset_path,
        to="public",
        data="anything",
        recursive=True,
        force="all",
        on_failure="ignore"
    )
    # Get the content of the published dataset summary to
    # be saved in the output JSON file
    dataset_desc = get_bidsdataset_content(target_dataset_path)
    # Dump the dataset_desc dict to a JSON file
    if output_file:
        with open(output_file, "w") as f:
            json.dump(dataset_desc, f, indent=4)
    print(SUCCESS)


def dataset_clone(input_data, output_file):
    """Clone a dataset from the public space of the HIP.

    Parameters
    ----------
    input_data : str
        Path to the input_data JSON file in the following format::

            {
                "sourceDatasetPath": "/path/to/public/dataset",
                "targetDatasetPath": "/path/of/dataset/to/be/cloned/in/private/space",
            }
    
    output_file : str
        Path to the output cloned dataset summary in JSON format
        to be indexed by the Data Search Engine of the HIP.
    """
    # Load the input_data JSON file into a dict
    with open(input_data, "r") as f:
        input_content = json.load(f)
    # Extract the source and target dataset paths
    source_dataset_path = input_content["sourceDatasetPath"]
    target_dataset_path = input_content["targetDatasetPath"]
    # Create the target dataset directory if it does not exist
    if not os.path.isdir(target_dataset_path):
        os.makedirs(target_dataset_path)
    # set_git_user_info(dataset_dir=target_dataset_path)
    # Clone the dataset from the public space
    datalad.api.install(
        source=source_dataset_path,
        path=target_dataset_path,
        description=f"Clone of {source_dataset_path}",
        get_data=True,
        reckless=None,
        recursive=True,
        on_failure="continue"
    )
    # Get the content of the cloned dataset summary to
    # be saved in the output JSON file
    dataset_desc = get_bidsdataset_content(target_dataset_path)
    # Dump the dataset_desc dict to a JSON file
    if output_file:
        with open(output_file, "w") as f:
            json.dump(dataset_desc, f, indent=4)
    print(SUCCESS)
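
For reference, a minimal sketch of calling the new function directly, assuming datahipy and its datalad dependency are installed; all paths below are hypothetical placeholders:

import json
import tempfile

from datahipy.bids.dataset import dataset_publish

# Hypothetical publish request, following the docstring format above.
request = {
    "sourceDatasetPath": "/collab/my_dataset",  # hypothetical source dataset
    "targetDatasetPath": "/public/my_dataset",  # hypothetical public target
}
# Write the request to a temporary JSON file, as the function expects a path.
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(request, f, indent=4)
    request_file = f.name

# Push the dataset to its "public" sibling and write the summary JSON
# that the Data Search Engine of the HIP can index.
dataset_publish(input_data=request_file, output_file="/tmp/published_summary.json")

dataset_clone takes a request of the same shape in the opposite direction, from the public space into a private one.
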
+22 −1
@@ -5,7 +5,7 @@

import argparse
from datahipy import __version__, __release_date__
from datahipy.bids.dataset import get_all_datasets_content
from datahipy.bids.dataset import get_all_datasets_content, dataset_publish, dataset_clone
from datahipy.handlers.dataset import DatasetHandler
from datahipy.handlers.participants import ParticipantHandler
from datahipy.handlers.project import create_project, import_subject, import_document
@@ -21,6 +21,8 @@ VALID_COMMANDS = [
    "dataset.checkout_tag",
    "datasets.get",
    "dataset.release_version",
    "dataset.publish",
    "dataset.clone",
    "sub.get",
    "sub.import",
    "sub.edit.clinical",
@@ -48,6 +50,16 @@ def get_parser():
        help="Path to the input data (e.g. input_data.json)",
        default="/input",
    )
    parser.add_argument(
        "--git_user_name",
        help="Git user name to use for Datalad operations",
        default=None,
    )
    parser.add_argument(
        "--git_user_email",
        help="Git user email to use for Datalad operations",
        default=None,
    )
    parser.add_argument(
        "-v",
        "--version",
@@ -61,15 +73,20 @@ def get_parser():

def main():
    """Run the command line interface."""
    # Create parser object
    parser = get_parser()

    # Parse arguments
    cmd_args = parser.parse_args()
    command = cmd_args.command
    input_data = cmd_args.input_data
    output_file = cmd_args.output_file
    dataset_path = cmd_args.dataset_path
    input_path = cmd_args.input_path
    git_user_name = cmd_args.git_user_name
    git_user_email = cmd_args.git_user_email

    # Initialize dataset and participant handler objects
    dhdl = DatasetHandler(dataset_path=dataset_path)
    phdl = ParticipantHandler(dataset_path=dataset_path, input_path=input_path)

@@ -94,6 +111,10 @@ def main():
        )
    if command == "dataset.release_version":
        return release_version(input_data=input_data, output_file=output_file)
    if command == "dataset.publish":
        return dataset_publish(input_data=input_data, output_file=output_file)
    if command == "dataset.clone":
        return dataset_clone(input_data=input_data, output_file=output_file)
    # Dataset subject / participant-level commands
    if command == "sub.import":
        return phdl.sub_import(input_data=input_data)
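
A minimal sketch of reaching the same commands through the CLI entry point; the flag name for the command itself is not shown in this hunk, so --command (like the file paths) is an assumption here:

import subprocess

# Assumed flag names: --command is not visible in this hunk; --input_data and
# --output_file mirror the attributes read in main(). Paths are placeholders.
subprocess.run(
    [
        "datahipy",
        "--command", "dataset.clone",
        "--input_data", "/tmp/clone_request.json",
        "--output_file", "/tmp/cloned_summary.json",
    ],
    check=True,
)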