
Commit a6ff9fa

Merge pull request #1075 from data-integrations/feat/support-updating-metadata-in-dataplex-for-newly-generated-data
add user managed schema option to dataplex sink plugin
2 parents 4f8af83 + 292552e commit a6ff9fa

6 files changed: +486 −20 lines changed
src/main/java/io/cdap/plugin/gcp/dataplex/common/util/DataplexConstants.java

Lines changed: 2 additions & 0 deletions
@@ -30,4 +30,6 @@ public final class DataplexConstants {
   public static final String NONE = "none";
   public static final String BIGQUERY_DATASET_ASSET_TYPE = "BIGQUERY_DATASET";
   public static final String STORAGE_BUCKET_ASSET_TYPE = "STORAGE_BUCKET";
+  public static final String STORAGE_BUCKET_PARTITION_KEY = "ts";
+  public static final String STORAGE_BUCKET_PATH_PREFIX = "gs://";
 }
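
For orientation, a minimal sketch of how the two new constants combine into a partition location, mirroring the logic in addPartitionInfo later in this diff. The bucket and table names are hypothetical, not part of this commit:

    // Hypothetical values for illustration only.
    String bucketName = "example-bucket";
    String tableName = "bq_source_2";
    String partitionDir = tableName + "/" + DataplexConstants.STORAGE_BUCKET_PARTITION_KEY + "=2022-08-22-21-52";
    String location = DataplexConstants.STORAGE_BUCKET_PATH_PREFIX + bucketName + "/" + partitionDir;
    // location is "gs://example-bucket/bq_source_2/ts=2022-08-22-21-52"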

src/main/java/io/cdap/plugin/gcp/dataplex/common/util/DataplexUtil.java

Lines changed: 257 additions & 8 deletions
@@ -17,22 +17,31 @@
 package io.cdap.plugin.gcp.dataplex.common.util;
 
 import com.google.api.gax.core.FixedCredentialsProvider;
+import com.google.api.gax.paging.Page;
 import com.google.auth.oauth2.GoogleCredentials;
-import com.google.auth.oauth2.ServiceAccountCredentials;
 import com.google.cloud.dataplex.v1.DataplexServiceClient;
 import com.google.cloud.dataplex.v1.DataplexServiceSettings;
+import com.google.cloud.dataplex.v1.Entity;
 import com.google.cloud.dataplex.v1.Job;
 import com.google.cloud.dataplex.v1.JobName;
 import com.google.cloud.dataplex.v1.MetadataServiceClient;
 import com.google.cloud.dataplex.v1.MetadataServiceSettings;
+import com.google.cloud.dataplex.v1.Partition;
+import com.google.cloud.dataplex.v1.Schema.Mode;
+import com.google.cloud.dataplex.v1.Schema.PartitionField;
+import com.google.cloud.dataplex.v1.Schema.PartitionStyle;
+import com.google.cloud.dataplex.v1.Schema.SchemaField;
+import com.google.cloud.dataplex.v1.Schema.Type;
 import com.google.cloud.dataplex.v1.TaskName;
+import com.google.cloud.storage.Blob;
+import com.google.cloud.storage.Storage;
 import io.cdap.cdap.api.data.schema.Schema;
+import io.cdap.cdap.api.data.schema.Schema.LogicalType;
 import io.cdap.cdap.etl.api.FailureCollector;
 import io.cdap.plugin.gcp.bigquery.util.BigQueryTypeSize;
 import io.cdap.plugin.gcp.bigquery.util.BigQueryUtil;
 import io.cdap.plugin.gcp.common.GCPConnectorConfig;
 import io.cdap.plugin.gcp.common.GCPUtils;
-
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.awaitility.Awaitility;
@@ -43,9 +52,9 @@
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Objects;
 import java.util.concurrent.TimeUnit;
 import java.util.stream.Collectors;
-
 import javax.annotation.Nullable;
 
 /**
@@ -230,8 +239,8 @@ private static Schema convertFieldBasedOnStandardType(com.google.cloud.dataplex.
     String error =
       String.format("Entity column '%s' is of unsupported type '%s'.", fieldName, standardType.name());
     String action = String.format("Supported column types are: %s.",
-      BigQueryUtil.BQ_TYPE_MAP.keySet().stream().map(t -> t.getStandardType().name())
-        .collect(Collectors.joining(", ")));
+                                  BigQueryUtil.BQ_TYPE_MAP.keySet().stream().map(t -> t.getStandardType().name())
+                                    .collect(Collectors.joining(", ")));
     if (collector != null) {
       collector.addFailure(error, action);
     } else {
@@ -292,7 +301,7 @@ public static void getJobCompletion(Configuration conf) throws IOException {
     try (DataplexServiceClient dataplexServiceClient = DataplexUtil.getDataplexServiceClient(googleCredentials)) {
       DataplexServiceClient.ListJobsPagedResponse
         jobList = dataplexServiceClient.listJobs(TaskName.newBuilder().setProject(projectID).setLake(lake).
-        setLocation(location).setTask(taskId).build());
+          setLocation(location).setTask(taskId).build());
       Job dataplexJob = jobList.iterateAll().iterator().next();
       try {
         Awaitility.await()
@@ -302,7 +311,7 @@ public static void getJobCompletion(Configuration conf) throws IOException {
           .until(() -> {
             Job currentJob =
               dataplexServiceClient.getJob(JobName.newBuilder().setProject(projectID).setLocation(location)
-                .setLake(lake).setTask(taskId).setJob(dataplexJob.getUid()).build());
+                                             .setLake(lake).setTask(taskId).setJob(dataplexJob.getUid()).build());
             LOG.debug("State of the Job is still " + currentJob.getState());
             return currentJob.getState() != null && !Job.State.RUNNING.equals(currentJob.getState()) &&
               !Job.State.STATE_UNSPECIFIED.equals(currentJob.getState());
@@ -311,7 +320,8 @@ public static void getJobCompletion(Configuration conf) throws IOException {
       throw new IOException("Job timed out.", e);
     }
     Job completedJob = dataplexServiceClient.getJob(JobName.newBuilder().setProject(projectID).setLocation(location)
-      .setLake(lake).setTask(taskId).setJob(dataplexJob.getUid()).build());
+                                                      .setLake(lake).setTask(taskId).setJob(dataplexJob.getUid())
+                                                      .build());
     if (!Job.State.SUCCEEDED.equals(completedJob.getState())) {
       throw new IOException("Job failed with message: " + completedJob.getMessage());
     }
@@ -392,4 +402,243 @@ public static MetadataServiceClient getMetadataServiceClient(GoogleCredentials c
     return metadataServiceClient;
   }
 
+  /**
+   * Get the storage format (MIME type) for entity data.
+   *
+   * @param format the file format of the data on GCS (e.g. avro, csv, json, orc, parquet)
+   * @return the corresponding MIME type, or "undefined" for unrecognized formats
+   */
+  public static String getStorageFormatForEntity(String format) {
+    switch (format) {
+      case "avro":
+        return "application/x-avro";
+      case "csv":
+        return "text/csv";
+      case "json":
+        return "application/json";
+      case "orc":
+        return "application/x-orc";
+      case "parquet":
+        return "application/x-parquet";
+      default:
+        return "undefined";
+    }
+  }
+
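A quick usage sketch, not part of this commit: the mapping is keyed on the lower-case format name, and anything unrecognized falls through to "undefined".

    String avroMime = DataplexUtil.getStorageFormatForEntity("avro"); // "application/x-avro"
    String unknown = DataplexUtil.getStorageFormatForEntity("xml");   // "undefined"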
+  /**
+   * Return a Dataplex Schema built from a CDAP Schema.
+   *
+   * @param schema input schema from CDAP
+   * @return the equivalent user-managed Dataplex schema
+   */
+  public static com.google.cloud.dataplex.v1.Schema getDataplexSchema(Schema schema) throws IOException {
+    com.google.cloud.dataplex.v1.Schema.Builder dataplexSchemaBuilder =
+      com.google.cloud.dataplex.v1.Schema.newBuilder();
+    dataplexSchemaBuilder.setUserManaged(true);
+    if (schema == null) {
+      return dataplexSchemaBuilder.build();
+    }
+    // Since "ts" is used by the Dataplex sink to create a time-partitioned layout on GCS, remove any
+    // existing column named "ts" to avoid a conflict.
+    dataplexSchemaBuilder.addAllFields(Objects.requireNonNull(schema.getFields()).stream()
+      .filter(avroField -> !avroField.getName().equals(DataplexConstants.STORAGE_BUCKET_PARTITION_KEY))
+      .map(DataplexUtil::toDataplexSchemaField).collect(Collectors.toList()));
+    // Add the partitioning scheme to the schema. Only the field name matters here;
+    // toDataplexPartitionField hardcodes the STRING type.
+    Schema.Field partitionField = Schema.Field.of(DataplexConstants.STORAGE_BUCKET_PARTITION_KEY, schema);
+    dataplexSchemaBuilder.setPartitionStyle(PartitionStyle.HIVE_COMPATIBLE);
+    dataplexSchemaBuilder.addPartitionFields(toDataplexPartitionField(partitionField));
+    return dataplexSchemaBuilder.build();
+  }
+
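A minimal usage sketch, assuming the CDAP Schema builders from io.cdap.cdap.api.data.schema; the record and field names are hypothetical:

    // Build a CDAP schema with one required and one nullable field.
    Schema cdapSchema = Schema.recordOf("purchase",
        Schema.Field.of("id", Schema.of(Schema.Type.LONG)),
        Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    // getDataplexSchema declares IOException, so call it where that can propagate.
    com.google.cloud.dataplex.v1.Schema dataplexSchema = DataplexUtil.getDataplexSchema(cdapSchema);
    // Expected result: a user-managed schema with fields id (INT64, REQUIRED) and
    // name (STRING, NULLABLE), plus a HIVE_COMPATIBLE partition field "ts" of type STRING.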
+  private static SchemaField toDataplexSchemaField(Schema.Field avroField) {
+    SchemaField.Builder fieldBuilder = SchemaField.newBuilder();
+    fieldBuilder.setName(avroField.getName());
+    fieldBuilder.setType(dataplexFieldType(avroField));
+    fieldBuilder.setMode(dataplexFieldMode(avroField));
+    if (avroField.getSchema().getType() == Schema.Type.RECORD) {
+      // Handle nested records, filtering the "ts" column for the same reason as in getDataplexSchema.
+      fieldBuilder.addAllFields(avroField.getSchema().getFields().stream()
+        .filter(schemaField -> !schemaField.getName().equals(DataplexConstants.STORAGE_BUCKET_PARTITION_KEY))
+        .map(DataplexUtil::toDataplexSchemaField).collect(Collectors.toList()));
+    }
+    return fieldBuilder.build();
+  }
+
+  private static PartitionField toDataplexPartitionField(Schema.Field avroField) {
+    PartitionField.Builder partitionFieldBuilder = PartitionField.newBuilder();
+    partitionFieldBuilder.setName(avroField.getName());
+    // Dataplex supports only a partition field type of STRING for files on GCS.
+    partitionFieldBuilder.setType(Type.STRING);
+    return partitionFieldBuilder.build();
+  }
+
+  private static Mode dataplexFieldMode(Schema.Field field) {
+    /*
+      Field modes supported by Dataplex:
+
+      MODE_UNSPECIFIED: Mode unspecified.
+      REQUIRED: The field has required semantics.
+      NULLABLE: The field has optional semantics, and may be null.
+      REPEATED: The field has repeated (0 or more) semantics, and is a list of values.
+    */
+    Schema.Type type = field.getSchema().getType();
+    if (type == Schema.Type.ARRAY) {
+      return Mode.REPEATED;
+    } else if (type == Schema.Type.UNION) {
+      for (Schema innerSchema : field.getSchema().getUnionSchemas()) {
+        if (innerSchema.getType() == Schema.Type.NULL) {
+          return Mode.NULLABLE;
+        }
+      }
+    }
+    return Mode.REQUIRED;
+  }
+
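The mode rules restated as a sketch (dataplexFieldMode is private, so this only illustrates the mapping; the field names are hypothetical):

    // ARRAY            -> Mode.REPEATED
    Schema.Field tags = Schema.Field.of("tags", Schema.arrayOf(Schema.of(Schema.Type.STRING)));
    // UNION with NULL  -> Mode.NULLABLE
    Schema.Field name = Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING)));
    // Everything else  -> Mode.REQUIRED
    Schema.Field id = Schema.Field.of("id", Schema.of(Schema.Type.LONG));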
+  private static Type dataplexFieldType(Schema.Field field) {
+    /*
+      Field types supported by Dataplex:
+
+      TYPE_UNSPECIFIED: SchemaType unspecified.
+      BOOLEAN: Boolean field.
+      BYTE: Single byte numeric field.
+      INT16: 16-bit numeric field.
+      INT32: 32-bit numeric field.
+      INT64: 64-bit numeric field.
+      FLOAT: Floating point numeric field.
+      DOUBLE: Double precision numeric field.
+      DECIMAL: Real value numeric field.
+      STRING: Sequence of characters field.
+      BINARY: Sequence of bytes field.
+      TIMESTAMP: Date and time field.
+      DATE: Date field.
+      TIME: Time field.
+      RECORD: Structured field. Nested fields that define the structure of the map.
+        If all nested fields are nullable, this field represents a union.
+      NULL: Null field that does not have values.
+    */
+    Schema schema = field.getSchema();
+
+    if (schema.getType() == Schema.Type.UNION) {
+      // Special case for UNION: a union of ["null", "non-null type"] means this is a
+      // nullable field. In Dataplex this will be a field with Mode = NULLABLE and Type = <non-null
+      // type>. So here we have to return the type of the other, non-NULL, element.
+      // A union of 3+ elements is not supported (it can't be represented as a Dataplex type).
+      if (schema.isNullable()) {
+        return dataplexPrimitiveFieldType(schema.getNonNullable());
+      }
+      return Type.TYPE_UNSPECIFIED;
+    }
+
+    if (schema.getType() == Schema.Type.ARRAY) {
+      // Special case for ARRAY: check the type of the underlying elements.
+      // In Dataplex this will be a field with Mode = REPEATED and Type = <array element type>.
+      return dataplexPrimitiveFieldType(schema.getComponentSchema());
+    }
+
+    return dataplexPrimitiveFieldType(schema);
+  }
+
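How the two special cases resolve, again as an illustrative sketch with hypothetical schemas:

    // Nullable union ["null", "long"] resolves to the non-null branch -> Type.INT64 (with Mode NULLABLE).
    Schema nullableLong = Schema.nullableOf(Schema.of(Schema.Type.LONG));
    // Array of strings: the element type is used -> Type.STRING (with Mode REPEATED).
    Schema stringArray = Schema.arrayOf(Schema.of(Schema.Type.STRING));
    // A union with more than one non-null branch -> Type.TYPE_UNSPECIFIED.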
+  private static Type dataplexPrimitiveFieldType(Schema schema) {
+    if (schema.getLogicalType() != null) {
+      Type type = dataplexLogicalFieldType(schema);
+      if (type != null) {
+        return type;
+      }
+    }
+
+    Schema.Type avroType = schema.getType();
+    switch (avroType) {
+      case RECORD:
+        return Type.RECORD;
+      case STRING:
+        return Type.STRING;
+      case FLOAT:
+        return Type.FLOAT;
+      case DOUBLE:
+        return Type.DOUBLE;
+      case BOOLEAN:
+        return Type.BOOLEAN;
+      case NULL:
+        return Type.NULL;
+      case BYTES: // BYTES is binary data with variable size.
+        return Type.BINARY;
+      case INT:
+        return Type.INT32;
+      case LONG:
+        return Type.INT64;
+      case UNION: // Shouldn't happen. Unions cannot contain other unions, as per the Avro spec.
+      case ARRAY: // Not supported as a primitive type (e.g. if this is an ARRAY of ARRAYs).
+      case MAP:
+      case ENUM:
+      default:
+        return Type.TYPE_UNSPECIFIED;
+    }
+  }
+
+  private static Type dataplexLogicalFieldType(Schema schema) {
+    LogicalType logicalType = schema.getLogicalType();
+
+    if (logicalType == LogicalType.DECIMAL) {
+      return Type.DECIMAL;
+    } else if (logicalType == LogicalType.DATE) {
+      return Type.DATE;
+    } else if (logicalType == LogicalType.TIME_MICROS
+      || logicalType == LogicalType.TIME_MILLIS) {
+      return Type.TIME;
+    } else if (logicalType == LogicalType.TIMESTAMP_MICROS
+      || logicalType == LogicalType.TIMESTAMP_MILLIS) {
+      return Type.TIMESTAMP;
+    }
+
+    return null;
+  }
+
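Logical types take precedence over the physical Avro type in dataplexPrimitiveFieldType. A sketch, assuming the CDAP Schema factory methods for logical types:

    // DECIMAL is physically BYTES but maps to Type.DECIMAL, not BINARY.
    Schema decimal = Schema.decimalOf(38, 9);
    // TIMESTAMP_MICROS is physically LONG but maps to Type.TIMESTAMP, not INT64.
    Schema timestamp = Schema.of(Schema.LogicalType.TIMESTAMP_MICROS);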
+  /**
+   * Add partition info to a Dataplex entity.
+   *
+   * @param entity dataplex entity
+   * @param credentials Google Credentials
+   * @param bucketName name of the GCS bucket the entity data was written to
+   * @param tableName name of the table, i.e. the top-level directory in the bucket
+   * @param project the GCP project ID
+   * @throws IOException
+   */
+  public static void addPartitionInfo(Entity entity, GoogleCredentials credentials,
+                                      String bucketName, String tableName, String project) throws IOException {
+    Storage storage = GCPUtils.getStorage(project, credentials);
+    String delimiter = "/";
+    String partitionPrefix = DataplexConstants.STORAGE_BUCKET_PARTITION_KEY + "=";
+    Page<Blob> blobs =
+      storage.list(
+        bucketName,
+        Storage.BlobListOption.prefix(tableName + delimiter + partitionPrefix),
+        Storage.BlobListOption.currentDirectory());
+    String lastPartition = null;
+    // Blob name example: bq_source_2/ts=2022-08-22-21-52/
+    // Take the last blob name in the iterator, as it is the one that corresponds to the entity that was just
+    // created or updated.
+    for (Blob blob : blobs.iterateAll()) {
+      lastPartition = blob.getName();
+    }
+    // Remove the trailing delimiter from the blob name when building the location string.
+    String location = DataplexConstants.STORAGE_BUCKET_PATH_PREFIX + bucketName +
+      delimiter + lastPartition.substring(0, lastPartition.length() - 1);
+    String[] lastPartitionParts = lastPartition.split(delimiter);
+    Partition.Builder dataplexPartitionBuilder = Partition.newBuilder();
+    try (MetadataServiceClient metadataServiceClient = getMetadataServiceClient(credentials)) {
+      Partition partition = dataplexPartitionBuilder
+        .setLocation(location)
+        // Extract the date value from the blob name (e.g. 2022-08-22-21-52) to pass as the partition value.
+        .addValues(lastPartitionParts[lastPartitionParts.length - 1].substring(partitionPrefix.length()))
+        .build();
+      metadataServiceClient.createPartition(entity.getName(), partition);
+    }
+  }
 }
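
A hedged end-to-end sketch of registering the latest partition after a run; the bucket, table, and project identifiers are hypothetical, and a real pipeline would obtain the entity and credentials from the sink's GCP configuration rather than Application Default Credentials:

    static void registerLatestPartition(Entity entity) throws IOException {
      // Application Default Credentials stand in for the plugin's configured service account.
      GoogleCredentials credentials = GoogleCredentials.getApplicationDefault();
      DataplexUtil.addPartitionInfo(entity, credentials, "example-bucket", "bq_source_2", "example-project");
      // This lists gs://example-bucket/bq_source_2/ts=*/ and registers the most recent
      // directory as a partition on the entity, with the timestamp as the partition value.
    }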
