Commit bd9d2707 authored by Sébastien Tourbier

feat: add dataset.publish and dataset.clone commands and corresponding functions

parent de489e24
datahipy/bids/dataset.py +99 −0
@@ -299,3 +299,102 @@ def get_all_datasets_content(
        with open(output_file, "w") as f:
            json.dump(datasets_desc, f, indent=4)
    print(SUCCESS)


def dataset_publish(input_data, output_file):
    """Publish a dataset to the public space of the HIP.

    Parameters
    ----------
    input_data : str
        Path to the input_data JSON file in the following format::

            {
                "sourceDatasetPath": "/path/to/private/or/collab/dataset",
                "targetDatasetPath": "/path/of/dataset/to/be/published/to/public/space",
            }

    output_file : str
        Path to the output published dataset summary in JSON format
        to be indexed by the Data Search Engine of the HIP.
    """
    # Load the HIP JSON request into a dict
    with open(input_data, "r") as f:
        input_content = json.load(f)
    # Extract the source and target dataset paths
    source_dataset_path = input_content["sourceDatasetPath"]
    target_dataset_path = input_content["targetDatasetPath"]
    # Create datalad dataset sibling to publish to
    datalad.api.create_sibling(
        name="public",
        dataset=source_dataset_path,
        sshurl=target_dataset_path,
        # Uncomment once the public space supports HTTPS access,
        # since as_common_datasrc expects the sshurl protocol
        # to be http or https.
        # as_common_datasrc=True,
        recursive=True
    )
    # Publish the dataset to the public space
    datalad.api.push(
        dataset=source_dataset_path,
        to="public",
        data="anything",
        recursive=True,
        force="all",
        on_failure="ignore"
    )
    # Get the content of the published dataset summary to
    # be saved in the output JSON file
    dataset_desc = get_bidsdataset_content(target_dataset_path)
    # Dump the dataset_desc dict to a JSON file
    if output_file:
        with open(output_file, "w") as f:
            json.dump(dataset_desc, f, indent=4)
    print(SUCCESS)


def dataset_clone(input_data, output_file):
    """Clone a dataset from the public space of the HIP.

    Parameters
    ----------
    input_data : str
        Path to the input_data JSON file in the following format::

            {
                "sourceDatasetPath": "/path/to/public/dataset",
                "targetDatasetPath": "/path/of/dataset/to/be/cloned/in/private/space",
            }
    
    output_file : str
        Path to the output cloned dataset summary in JSON format
        to be indexed by the Data Search Engine of the HIP.
    """
    # Load the input_data JSON file into a dict
    with open(input_data, "r") as f:
        input_content = json.load(f)
    # Extract the source and target dataset paths
    source_dataset_path = input_content["sourceDatasetPath"]
    target_dataset_path = input_content["targetDatasetPath"]
    # Create the target dataset directory if it does not exist
    if not os.path.isdir(target_dataset_path):
        os.makedirs(target_dataset_path)
    # set_git_user_info(dataset_dir=target_dataset_path)
    # Clone the dataset from the public space
    datalad.api.install(
        source=source_dataset_path,
        path=target_dataset_path,
        description=f"Clone of {source_dataset_path}",
        get_data=True,
        reckless=None,
        recursive=True,
        on_failure="continue"
    )
    # Get the content of the cloned dataset summary to
    # be saved in the output JSON file
    dataset_desc = get_bidsdataset_content(target_dataset_path)
    # Dump the dataset_desc dict to a JSON file
    if output_file:
        with open(output_file, "w") as f:
            json.dump(dataset_desc, f, indent=4)
    print(SUCCESS)
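
For reference, a minimal sketch of calling the new function directly, assuming datahipy and its datalad dependency are installed; all paths below are hypothetical placeholders:

import json
import tempfile

from datahipy.bids.dataset import dataset_publish

# Hypothetical publish request, following the docstring format above.
request = {
    "sourceDatasetPath": "/collab/my_dataset",  # hypothetical source dataset
    "targetDatasetPath": "/public/my_dataset",  # hypothetical public target
}
# Write the request to a temporary JSON file, as the function expects a path.
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(request, f, indent=4)
    request_file = f.name

# Push the dataset to its "public" sibling and write the summary JSON
# that the Data Search Engine of the HIP can index.
dataset_publish(input_data=request_file, output_file="/tmp/published_summary.json")

dataset_clone takes a request of the same shape in the opposite direction, from the public space into a private one.
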
+22 −1
@@ -5,7 +5,7 @@

import argparse
from datahipy import __version__, __release_date__
from datahipy.bids.dataset import get_all_datasets_content
from datahipy.bids.dataset import get_all_datasets_content, dataset_publish, dataset_clone
from datahipy.handlers.dataset import DatasetHandler
from datahipy.handlers.participants import ParticipantHandler
from datahipy.handlers.project import create_project, import_subject, import_document
@@ -21,6 +21,8 @@ VALID_COMMANDS = [
    "dataset.checkout_tag",
    "datasets.get",
    "dataset.release_version",
    "dataset.publish",
    "dataset.clone",
    "sub.get",
    "sub.import",
    "sub.edit.clinical",
@@ -48,6 +50,16 @@ def get_parser():
        help="Path to the input data (e.g. input_data.json)",
        default="/input",
    )
    parser.add_argument(
        "--git_user_name",
        help="Git user name to use for Datalad operations",
        default=None,
    )
    parser.add_argument(
        "--git_user_email",
        help="Git user email to use for Datalad operations",
        default=None,
    )
    parser.add_argument(
        "-v",
        "--version",
@@ -61,15 +73,20 @@ def get_parser():

def main():
    """Run the command line interface."""
    # Create parser object
    parser = get_parser()

    # Parse arguments
    cmd_args = parser.parse_args()
    command = cmd_args.command
    input_data = cmd_args.input_data
    output_file = cmd_args.output_file
    dataset_path = cmd_args.dataset_path
    input_path = cmd_args.input_path
    git_user_name = cmd_args.git_user_name
    git_user_email = cmd_args.git_user_email

    # Initialize dataset and participant handler objects
    dhdl = DatasetHandler(dataset_path=dataset_path)
    phdl = ParticipantHandler(dataset_path=dataset_path, input_path=input_path)

@@ -94,6 +111,10 @@ def main():
        )
    if command == "dataset.release_version":
        return release_version(input_data=input_data, output_file=output_file)
    if command == "dataset.publish":
        return dataset_publish(input_data=input_data, output_file=output_file)
    if command == "dataset.clone":
        return dataset_clone(input_data=input_data, output_file=output_file)
    # Dataset subject / participant-level commands
    if command == "sub.import":
        return phdl.sub_import(input_data=input_data)
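
A minimal sketch of reaching the same commands through the CLI entry point; the flag name for the command itself is not shown in this hunk, so --command (like the file paths) is an assumption here:

import subprocess

# Assumed flag names: --command is not visible in this hunk; --input_data and
# --output_file mirror the attributes read in main(). Paths are placeholders.
subprocess.run(
    [
        "datahipy",
        "--command", "dataset.clone",
        "--input_data", "/tmp/clone_request.json",
        "--output_file", "/tmp/cloned_summary.json",
    ],
    check=True,
)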