Source code for dkube.sdk.rsrcs.dataset

from __future__ import print_function

import sys
import time
from pprint import pprint

from dkube.sdk.internal import dkube_api
from dkube.sdk.internal.dkube_api.models.datum_model import DatumModel
from dkube.sdk.internal.dkube_api.models.datum_model_hostpath import \
    DatumModelHostpath
from dkube.sdk.internal.dkube_api.models.datum_model_k8svolume import \
    DatumModelK8svolume
from dkube.sdk.internal.dkube_api.models.sql_access_info import SQLAccessInfo
from dkube.sdk.internal.dkube_api.models.gcs_access_info import GCSAccessInfo
from dkube.sdk.internal.dkube_api.models.git_access_credentials import \
    GitAccessCredentials
from dkube.sdk.internal.dkube_api.models.git_access_info import GitAccessInfo
from dkube.sdk.internal.dkube_api.models.nfs_access_info import NFSAccessInfo
from dkube.sdk.internal.dkube_api.models.redshift_access_info import \
    RedshiftAccessInfo
from dkube.sdk.internal.dkube_api.models.repo_gcs_access_info_secret import \
    RepoGCSAccessInfoSecret
from dkube.sdk.internal.dkube_api.models.s3_access_credentials import \
    S3AccessCredentials
from dkube.sdk.internal.dkube_api.rest import ApiException

from .util import *


class DkubeDataset(object):

    """
        This class defines the DKube dataset with helper functions to set properties of the dataset.::

            from dkube.sdk import *
            mnist = DkubeDataset("oneconv", name="mnist")

            Where the first argument is the user of this dataset. The user should be a valid onboarded user in DKube.

    """

    DATASET_SOURCES = ["dvs", "git", "aws_s3", "s3", "gcs",
                       "nfs", "redshift", "k8svolume", "sql"]
    """
    List of valid datasources in DKube.
    Some datasources are downloaded while some are remotely referenced.

    :bash:`dvs` :- To create an empty repository which can be used in future runs.

    :bash:`git` :- If data is in a git repo. All git compatible repos are supported - github, bitbucket, gitlab. :bash:`Downloaded`

    :bash:`aws_s3` :- If the data is in an AWS s3 bucket. :bash:`Downloaded | Remote`

    :bash:`s3` :- Non AWS s3 data source, like MinIO deployed on an internal cluster. :bash:`Downloaded | Remote`

    :bash:`gcs` :- Google cloud storage as data source. :bash:`Downloaded`

    :bash:`nfs` :- External NFS server as data source. :bash:`Remote`

    :bash:`redshift` :- Redshift as data source. :bash:`Remote`

    :bash:`k8svolume` :- Kubernetes volume as data source. :bash:`Remote`

    :bash:`hostpath` :- If data is in a path on the host machine. :bash:`Remote`

    :bash:`sql` :- SQL database as data source. :bash:`Remote`

    """

    GIT_ACCESS_OPTS = ["apikey", "sshkey", "password"]
    """
    List of authentication options supported for the git data source.

    :bash:`apikey` :- GitHub APIKey based authentication. The key must have permission on the repo to clone and checkout.

    :bash:`sshkey` :- Git SSH key based authentication.

    :bash:`password` :- Standard username/password based authentication.

    """

    def __init__(
            self, user, name=generate("dataset"), remote=False, tags=None):
        self.k8svolume = DatumModelK8svolume(name=None)

        self.sql = SQLAccessInfo(
            host=None, port=None, username=None, password=None,
            database=None, odbc_connection_string=None,
            jdbc_connection_string=None, cacert=None, provider=None)

        self.redshift = RedshiftAccessInfo(
            endpoint=None, username=None, password=None, database=None,
            region=None, cacert=None, insecure_ssl=None)

        self.nfsaccess = NFSAccessInfo(server=None, path=None)

        self.gcssecret = RepoGCSAccessInfoSecret(name=None, content=None)
        self.gcsaccess = GCSAccessInfo(
            bucket=None, prefix=None, secret=self.gcssecret)

        self.s3access = S3AccessCredentials(
            access_key_id=None, access_key=None, bucket=None,
            prefix=None, endpoint=None)

        self.gitcreds = GitAccessCredentials(
            username=None, password=None, apikey=None,
            sshkey=None, private=True)
        self.gitaccess = GitAccessInfo(
            path=None, url=None, branch=None, credentials=self.gitcreds)

        self.hostpath = DatumModelHostpath(path=None)

        self.datum = DatumModel(
            name=None, tags=None, _class='dataset',
            dvs=None, source='dvs', url=None, remote=remote,
            gitaccess=self.gitaccess, sql=self.sql, s3access=self.s3access,
            nfsaccess=self.nfsaccess, gcsaccess=self.gcsaccess,
            hostpath=self.hostpath, redshift=self.redshift)

        self.extract = False
        self.update_basic(user, name, tags)

    def update_basic(self, user, name, tags):
        tags = list_of_strs(tags)
        self.user = user
        self.name = name
        self.datum.name = name
        self.datum.tags = tags
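
    # Usage sketch (illustrative, not part of the class): a minimal flow for
    # defining a dataset. Assumes an onboarded DKube user "oneconv"; the
    # resulting object is typically submitted through the SDK's DkubeApi
    # client (an assumption here, not shown in this module).
    #
    #   from dkube.sdk import DkubeDataset
    #   ds = DkubeDataset("oneconv", name="mnist", tags=["project:demo"])
    #   # configure a source with one of the update_*_details() helpers below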

    def update_dataset_source(self, source=DATASET_SOURCES[0]):
        """
            Method to update the source for this dataset.
            It should be one of the choices mentioned in DATASET_SOURCES.
            Default value is **dvs**.

        """

        self.datum.source = source

    def update_git_details(
            self, url, branch=None, authopt=GIT_ACCESS_OPTS[0], authval=None):
        """
            Method to update the details of the git datasource.

            *Inputs*

                url
                    A valid Git URL. The following are considered valid URLs.

                    - CloneURL : https://github.com/oneconvergence/dkube.git

                    - TreeURL : https://github.com/oneconvergence/dkube/tree/2.1.dev/dkube

                    - BlobURL : https://github.com/oneconvergence/dkube/blob/2.1.dev/dkube/sdk/api.py

                    - ZipURL : https://github.com/oneconvergence/dkube/archive/2.1.dev.zip

                branch
                    Valid branch of the git repo. If not provided then the **master** branch is used by default.

                authopt
                    One of the valid options from **GIT_ACCESS_OPTS**

                authval
                    Value corresponding to the authopt

        """

        self.datum.source = "git"
        self.datum.url = url
        self.gitaccess.url = url
        self.gitaccess.branch = branch
        self.gitcreds.username = self.user

        if authopt == 'apikey':
            self.gitcreds.apikey = authval
        elif authopt == 'password':
            self.gitcreds.password = authval
        elif authopt == 'sshkey':
            self.gitcreds.sshkey = authval
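
    # Example (illustrative): dataset backed by a GitHub repo using an API
    # key. The repo URL, branch and key below are placeholders.
    #
    #   ds = DkubeDataset("oneconv", name="mnist-git")
    #   ds.update_git_details(
    #       "https://github.com/oneconvergence/dkube-examples.git",
    #       branch="master", authopt="apikey", authval="<github-api-key>")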

    def update_awss3_details(self, bucket, prefix, key, secret):
        """
            Method to update details of the AWS s3 data source.

            *Inputs*

                bucket
                    Valid bucket in AWS s3

                prefix
                    Path to an object in the bucket. Dkube will fetch recursively all objects under this prefix.

                key
                    AWS s3 access key id

                secret
                    AWS s3 access key secret

        """

        self.datum.source = "aws_s3"
        self.datum.url = "/" + bucket + "/" + prefix
        self.s3access.bucket = bucket
        self.s3access.prefix = prefix
        self.s3access.access_key_id = key
        self.s3access.access_key = secret
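
    # Example (illustrative): AWS S3 backed dataset. Bucket, prefix and
    # credentials are placeholders.
    #
    #   ds = DkubeDataset("oneconv", name="sales-raw")
    #   ds.update_awss3_details(
    #       bucket="my-bucket", prefix="datasets/sales",
    #       key="<aws-access-key-id>", secret="<aws-secret-access-key>")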

    def update_s3_details(self, endpoint, bucket, prefix, key, secret):
        """
            Method to update details of an s3 compatible data source like MinIO.

            *Inputs*

                endpoint
                    URL endpoint of the s3 compatible store.

                bucket
                    Valid bucket name in the s3 store

                prefix
                    Path to an object in the bucket. Dkube will fetch recursively all objects under this prefix.

                key
                    s3 access key id

                secret
                    s3 access key secret

        """

        self.datum.source = "s3"
        self.s3access.endpoint = endpoint
        self.s3access.prefix = prefix
        self.s3access.bucket = bucket
        self.s3access.access_key_id = key
        self.s3access.access_key = secret
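
    # Example (illustrative): MinIO (s3 compatible) backed dataset. The
    # endpoint, bucket and credentials are placeholders.
    #
    #   ds = DkubeDataset("oneconv", name="images")
    #   ds.update_s3_details(
    #       endpoint="http://minio.example.com:9000", bucket="images",
    #       prefix="train/", key="<access-key>", secret="<secret-key>")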

    def update_gcs_details(self, bucket, prefix, key, secret):
        """
            Method to update details of google cloud storage.

            *Inputs*

                bucket
                    Valid bucket in GCS

                prefix
                    Path to an object in the bucket. Dkube will fetch recursively all objects under this prefix.

                key
                    Name of the GCS secret

                secret
                    Content of the secret

        """

        self.datum.source = "gcs"
        self.gcsaccess.bucket = bucket
        self.gcsaccess.prefix = prefix
        self.gcssecret.name = key
        self.gcssecret.content = secret
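
    # Example (illustrative): GCS backed dataset. Here "key" is the name of
    # the GCS secret and "secret" is its content; the secret file name and
    # bucket below are placeholders.
    #
    #   with open("gcs-serviceaccount.json") as f:
    #       secret_content = f.read()
    #   ds = DkubeDataset("oneconv", name="gcs-data")
    #   ds.update_gcs_details(
    #       bucket="my-gcs-bucket", prefix="datasets/",
    #       key="gcs-sa-secret", secret=secret_content)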

    def update_nfs_details(self, server, path="/"):
        """
            Method to update details of the nfs data source.

            *Inputs*

                server
                    IP address of the nfs server.

                path
                    Path in the nfs export. This path is directly mounted for the user program.

        """

        self.datum.source = "nfs"
        self.nfsaccess.path = path
        self.nfsaccess.server = server
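
    # Example (illustrative): NFS backed dataset; the export is mounted
    # directly for the user's run. Server IP and path are placeholders.
    #
    #   ds = DkubeDataset("oneconv", name="nfs-data")
    #   ds.update_nfs_details(server="10.0.0.5", path="/exports/datasets")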

    def update_redshift_details(
            self, endpoint, database, user=None, password=None):
        """
            Method to update details of the redshift data source.

            *Inputs*

                endpoint
                    Redshift endpoint

                database
                    Database in redshift to connect to.

                user
                    Login username. If not provided, the DKube login name of this dataset's user is used.

                password
                    Login password.

        """

        self.datum.source = "redshift"
        self.redshift.endpoint = endpoint
        if user:
            self.redshift.username = user
        else:
            self.redshift.username = self.user
        self.redshift.password = password
        self.redshift.database = database
        self.redshift.insecure_ssl = True
        self.datum.remote = True
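
    # Example (illustrative): Redshift backed dataset, referenced remotely.
    # Endpoint, database and password are placeholders; the username defaults
    # to the DKube user when omitted.
    #
    #   ds = DkubeDataset("oneconv", name="warehouse")
    #   ds.update_redshift_details(
    #       endpoint="https://redshift.example.com:5439",
    #       database="analytics", password="<password>")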

    def update_k8svolume_details(self, name):
        """
            Method to update details of the k8s volume data source.

            *Inputs*

                name
                    Name of the kubernetes volume. Volume should not be already **Bound**.

        """

        self.datum.source = "k8svolume"
        self.k8svolume.name = name
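
    # Example (illustrative): dataset backed by an existing, unbound
    # Kubernetes volume; the volume name is a placeholder.
    #
    #   ds = DkubeDataset("oneconv", name="raw-data")
    #   ds.update_k8svolume_details("raw-data-pv")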

    def update_hostpath_details(self, path):
        """
            Method to update details of the hostpath data source.

            *Inputs*

                path
                    Location on the host machine where the data is stored.

        """

        self.datum.source = "hostpath"
        self.datum.remote = True
        self.hostpath.path = path
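
    # Example (illustrative): dataset backed by a path on the host machine's
    # filesystem; the path is a placeholder.
    #
    #   ds = DkubeDataset("oneconv", name="local-data")
    #   ds.update_hostpath_details("/mnt/data/mnist")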

    def update_puburl_details(self, url, extract):
        """
            Method to update details of the pub_url data source.

            *Inputs*

                url
                    pub_url of the data

                extract
                    If set to True, the data will be extracted.

        """

        self.datum.source = "pub_url"
        self.datum.url = url
        self.extract = extract
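
    # Example (illustrative): dataset fetched from a public URL, with the
    # archive extracted; the URL is a placeholder.
    #
    #   ds = DkubeDataset("oneconv", name="mnist-pub")
    #   ds.update_puburl_details(
    #       "https://example.com/datasets/mnist.tar.gz", extract=True)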

    def update_sql_details(self, provider, host='', port=None, username=None,
                           password=None, database='',
                           odbc_connection_string=None,
                           jdbc_connection_string=None):
        """
            Method to update details of the sql data source.

            *Inputs*

                provider
                    Possible values are 'oracle', 'mysql' and 'mssql' (string)

                host
                    Host address (string)

                port
                    Port number (integer)

                username
                    Username for accessing the database (string)

                password
                    Password for accessing the database (string)

                database
                    Name of the sql database (string)

                odbc_connection_string
                    ODBC connection string

                jdbc_connection_string
                    JDBC connection string

        """

        self.datum.source = "sql"
        self.datum.url = "sql:" + host + ":" + str(port) + ":" + database
        self.sql.provider = provider
        self.sql.host = host
        self.sql.port = port
        self.sql.username = username
        self.sql.password = password
        self.sql.database = database
        self.sql.odbc_connection_string = odbc_connection_string
        self.sql.jdbc_connection_string = jdbc_connection_string
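
    # Example (illustrative): MySQL backed SQL dataset. Host, port,
    # credentials and database name are placeholders.
    #
    #   ds = DkubeDataset("oneconv", name="orders-db")
    #   ds.update_sql_details(
    #       provider="mysql", host="mysql.example.com", port=3306,
    #       username="dkube", password="<password>", database="orders")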