# Source code for dkube.sdk.rsrcs.dataset

from __future__ import print_function

import sys
import time
from pprint import pprint

from dkube.sdk.internal import dkube_api
from dkube.sdk.internal.dkube_api.models.datum_model import DatumModel
from dkube.sdk.internal.dkube_api.models.datum_model_hostpath import \
    DatumModelHostpath
from dkube.sdk.internal.dkube_api.models.datum_model_k8svolume import \
    DatumModelK8svolume
from dkube.sdk.internal.dkube_api.models.sql_access_info import SQLAccessInfo
from dkube.sdk.internal.dkube_api.models.gcs_access_info import GCSAccessInfo
from dkube.sdk.internal.dkube_api.models.git_access_credentials import \
    GitAccessCredentials
from dkube.sdk.internal.dkube_api.models.git_access_info import GitAccessInfo
from dkube.sdk.internal.dkube_api.models.nfs_access_info import NFSAccessInfo
from dkube.sdk.internal.dkube_api.models.redshift_access_info import \
    RedshiftAccessInfo
from dkube.sdk.internal.dkube_api.models.repo_gcs_access_info_secret import \
    RepoGCSAccessInfoSecret
from dkube.sdk.internal.dkube_api.models.s3_access_credentials import \
    S3AccessCredentials
from dkube.sdk.internal.dkube_api.rest import ApiException

from .util import *


class DkubeDataset(object):

    """

        This class defines the DKube dataset with helper functions to set properties of dataset.::

            from dkube.sdk import *
            mnist = DkubeDataset("oneconv", name="mnist")

            Where first argument is the user of this dataset. User should be a valid onboarded user in dkube.

        *Inputs*

            user
                Owner of the dataset. Also used as the default login name for
                the git and redshift sources.

            name
                Name of the dataset. When omitted, a name is generated for
                each new instance.

            remote
                Stored as the ``remote`` field of the underlying datum model.

            tags
                Tags attached to the dataset.

    """

    # NOTE(review): update_hostpath_details() and update_puburl_details() set
    # the sources "hostpath" and "pub_url", which are not in this list --
    # confirm whether they should be added.
    DATASET_SOURCES = ["dvs", "git", "aws_s3",
                       "s3", "gcs", "nfs", "redshift", "k8svolume", "sql"]
    """
        List of valid datasources in DKube.
        Some datasources are downloaded while some are remotely referenced.

        :bash:`dvs` :- To create an empty repository which can be used in future runs.

        :bash:`git` :- If data is in the git repo. All git compatible repos are supported - github, bitbucket, gitlab. :bash:`Downloaded`

        :bash:`aws_s3` :- If the data is in AWS s3 bucket. :bash:`Downloaded | Remote`

        :bash:`s3` :- Non aws s3 data source. Like MinIO deployed on internal cluster. :bash:`Downloaded | Remote`

        :bash:`gcs` :- Google cloud storage as data source. :bash:`Downloaded`

        :bash:`nfs` :- External NFS server as data source. :bash:`Remote`

        :bash:`redshift` :- Redshift as data source. :bash:`Remote`

        :bash:`k8svolume` :- Kubernetes volume as data source. :bash:`Remote`

        :bash:`hostpath` :- If data is in a path in host machine. :bash:`Remote`

        :bash:`sql` :- sql dataset source :bash:`Remote`

    """

    GIT_ACCESS_OPTS = ["apikey", "sshkey", "password"]
    """
        List of authentication options supported for git data source.

        :bash:`apikey` :- Github APIKey based authentication. This must have permission on the repo to clone and checkout.

        :bash:`sshkey` :- Git SSH key based authentication.

        :bash:`password` :- Standard username/password based.

    """

    def __init__(
            self,
            user,
            name=None,
            remote=False,
            tags=None):
        """
            Build an empty dataset description for the given user.

            Every access-info object is created with empty fields; the
            ``update_*_details`` methods fill in the one matching the chosen
            source. The source starts as ``dvs``.
        """
        # A default of generate("dataset") in the signature would be evaluated
        # only once, when the class is defined, so every dataset created
        # without a name would share the same one. Generate per instance.
        if name is None:
            name = generate("dataset")

        # NOTE(review): unlike the other access objects below, k8svolume is
        # not handed to DatumModel -- confirm whether that is intentional.
        self.k8svolume = DatumModelK8svolume(name=None)

        self.sql = SQLAccessInfo(
            host=None,
            port=None,
            username=None,
            password=None,
            database=None,
            odbc_connection_string=None,
            jdbc_connection_string=None,
            cacert=None,
            provider=None)

        self.redshift = RedshiftAccessInfo(
            endpoint=None,
            username=None,
            password=None,
            database=None,
            region=None,
            cacert=None,
            insecure_ssl=None)

        self.nfsaccess = NFSAccessInfo(server=None, path=None)

        self.gcssecret = RepoGCSAccessInfoSecret(name=None, content=None)
        self.gcsaccess = GCSAccessInfo(
            bucket=None, prefix=None, secret=self.gcssecret)

        self.s3access = S3AccessCredentials(
            access_key_id=None,
            access_key=None,
            bucket=None,
            prefix=None,
            endpoint=None)

        self.gitcreds = GitAccessCredentials(
            username=None,
            password=None,
            apikey=None,
            sshkey=None,
            private=True)
        self.gitaccess = GitAccessInfo(
            path=None, url=None, branch=None, credentials=self.gitcreds)
        self.hostpath = DatumModelHostpath(
            path=None)

        # The datum holds references to the access objects above, so later
        # updates made through self.<access> are visible through self.datum.
        self.datum = DatumModel(
            name=None,
            tags=None,
            _class='dataset',
            dvs=None,
            source='dvs',
            url=None,
            remote=remote,
            gitaccess=self.gitaccess,
            sql=self.sql,
            s3access=self.s3access,
            nfsaccess=self.nfsaccess,
            gcsaccess=self.gcsaccess,
            hostpath=self.hostpath,
            redshift=self.redshift)
        self.extract = False
        self.update_basic(user, name, tags)

    def update_basic(self, user, name, tags):
        """
            Method to set the user, name and tags of this dataset.

            *Inputs*

                user
                    Owner of the dataset.

                name
                    Name of the dataset; stored on the object and on the datum.

                tags
                    Tags for the dataset; passed through ``list_of_strs``
                    before being stored on the datum.
        """
        tags = list_of_strs(tags)

        self.user = user
        self.name = name

        self.datum.name = name
        self.datum.tags = tags

    def update_dataset_source(self, source=DATASET_SOURCES[0]):
        """
            Method to update the source for this dataset.
            It should be one of the choice mentioned in DATASET_SOURCES
            Default value is **dvs**
        """
        self.datum.source = source

    def update_git_details(
            self,
            url,
            branch=None,
            authopt=GIT_ACCESS_OPTS[0],
            authval=None):
        """
            Method to update the details of git datasource.

            The git user name is set to the user of this dataset.

            *Inputs*

                url
                    A valid Git URL. Following are considered as valid URLs.

                    - CloneURL : https://github.com/oneconvergence/dkube.git

                    - TreeURL : https://github.com/oneconvergence/dkube/tree/2.1.dev/dkube

                    - BlobURL : https://github.com/oneconvergence/dkube/blob/2.1.dev/dkube/sdk/api.py

                    - ZipURL : https://github.com/oneconvergence/dkube/archive/2.1.dev.zip

                branch
                    Valid branch of git repo. If not provided then **master** branch is used by default.

                authopt
                    One of the valid option from **GIT_ACCESS_OPTS**.
                    Any other value leaves the credentials untouched.

                authval
                    Value corresponding to the authopt
        """

        self.datum.source = "git"
        self.datum.url = url
        self.gitaccess.url = url
        self.gitaccess.branch = branch

        self.gitcreds.username = self.user

        # Only the credential field selected by authopt is written; the
        # other two keep whatever value they already had.
        if authopt == 'apikey':
            self.gitcreds.apikey = authval
        elif authopt == 'password':
            self.gitcreds.password = authval
        elif authopt == 'sshkey':
            self.gitcreds.sshkey = authval

    def update_awss3_details(self, bucket, prefix, key, secret):
        """
            Method to update details of aws s3 data source.

            The dataset url is set to ``/<bucket>/<prefix>``.

            *Inputs*

                bucket
                    Valid bucket in aws s3

                prefix
                    Path to an object in the bucket. Dkube will fetch recursively all objects under this prefix.

                key
                    AWS s3 access key id

                secret
                    AWS s3 access key secret
        """

        self.datum.source = "aws_s3"
        self.datum.url = "/" + bucket + "/" + prefix
        self.s3access.bucket = bucket
        self.s3access.prefix = prefix
        self.s3access.access_key_id = key
        self.s3access.access_key = secret

    def update_s3_details(self, endpoint, bucket, prefix, key, secret):
        """
            Method to update details of s3 data source like minio.

            *Inputs*

                endpoint
                    Endpoint of the s3 store

                bucket
                    Valid bucket name in s3 store

                prefix
                    Path to an object in the bucket. Dkube will fetch recursively all objects under this prefix.

                key
                    S3 access key id

                secret
                    s3 access key secret
        """

        self.datum.source = "s3"
        self.s3access.endpoint = endpoint
        self.s3access.prefix = prefix
        self.s3access.bucket = bucket
        self.s3access.access_key_id = key
        self.s3access.access_key = secret

    def update_gcs_details(self, bucket, prefix, key, secret):
        """
            Method to update details of google cloud storage.

            *Inputs*

                bucket
                    Valid bucket in GCS

                prefix
                    Path to an object in bucket. Dkube will fetch recursively all objects under this prefix.

                key
                    Name of the GCS secret

                secret
                    Content of the secret
        """

        self.datum.source = "gcs"
        self.gcsaccess.bucket = bucket
        self.gcsaccess.prefix = prefix
        self.gcssecret.name = key
        self.gcssecret.content = secret

    def update_nfs_details(self, server, path="/"):
        """
            Method to update details of nfs data source.

            *Inputs*

                server
                    IP address of the nfs server.

                path
                    Path in the nfs export. This path is directly mounted for the user program.

        """

        self.datum.source = "nfs"
        self.nfsaccess.path = path
        self.nfsaccess.server = server

    def update_redshift_details(
            self,
            endpoint,
            database,
            user=None,
            password=None):
        """
            Method to update details of redshift data source.

            The dataset is marked as remote and ``insecure_ssl`` is enabled.

            *Inputs*

                endpoint
                    Redshift endpoint

                database
                    Database in redshift to connect to.

                user
                    Login user name. If not provided, the user of this
                    dataset is used.

                password
                    Login password.

        """

        self.datum.source = "redshift"
        self.redshift.endpoint = endpoint
        # Fall back to the dataset owner when no explicit user is given.
        if user:
            self.redshift.username = user
        else:
            self.redshift.username = self.user
        self.redshift.password = password
        self.redshift.database = database
        self.redshift.insecure_ssl = True
        self.datum.remote = True

    def update_k8svolume_details(self, name):
        """
            Method to update details of k8s volume data source.

            *Inputs*

                name
                    Name of the kubernetes volume. Volume should not be already **Bound**.
        """

        self.datum.source = "k8svolume"
        self.k8svolume.name = name

    def update_hostpath_details(self, path):
        """
            Method to update details of hostpath.

            The dataset is marked as remote.

            *Inputs*

                path
                    Location in the host machine where the data is stored.
        """

        self.datum.source = "hostpath"
        self.datum.remote = True
        self.hostpath.path = path

    def update_puburl_details(self, url, extract):
        """
            Method to update details of pub_url data source.

            *Inputs*

                url
                    pub_url of the data

                extract
                    if set to True, data will be extracted
        """

        self.datum.source = "pub_url"
        self.datum.url = url
        self.extract = extract

    def update_sql_details(
            self,
            provider,
            host='',
            port=None,
            username=None,
            password=None,
            database='',
            odbc_connection_string=None,
            jdbc_connection_string=None):
        """
            Method to update details of sql data source.

            The dataset url is set to ``sql:<host>:<port>:<database>``.

            *Inputs*

                provider
                    possible values are 'oracle','mysql' and 'mssql' (string)

                host
                    host address (string)

                port
                    port number (integer)

                username
                    username for accessing the database (string)

                password
                    password for accessing the database (string)

                database
                    name of the sql database (string)

                odbc_connection_string
                    odbc connection string

                jdbc_connection_string
                    jdbc connection string
        """

        self.datum.source = "sql"
        # NOTE(review): when port is left as None the url contains the text
        # "None" (str(None)) -- confirm this is what the backend expects.
        self.datum.url = "sql:" + host + ":" + str(port) + ":" + database
        self.sql.provider = provider
        self.sql.host = host
        self.sql.port = port
        self.sql.username = username
        self.sql.password = password
        self.sql.database = database
        self.sql.odbc_connection_string = odbc_connection_string
        self.sql.jdbc_connection_string = jdbc_connection_string