Skip to main content

Connections

googleCloudStorage

Commentary

added in 0.9.0

Connects to Google Cloud Storage.

You must set the environment variable GOOGLE_APPLICATION_CREDENTIALS to the location of a credential configuration file within your container. You can set Docker environment variables with either -e or --env-file, similar to how the license environment variables are passed.

The target bucket must exist prior to writing data.

A new blob will be created following the default batch rate, which can be overridden by time, elements, or serialized bytes. 1

Blobs are created with the key name <blob-prefix>-<ulid>.<file-suffix>, where ulid is a monotonically increasing ULID. This means all blobs in the bucket are sortable by key name.

You can choose from a range of serialization formats and compression types 2.

This connection also works with GCS-compatible services like fake-gcs-server 3.


Examples

Configuring the connection

Set projectId to configure where the data is written.

{
"connections": {
"gcs": {
"kind": "googleCloudStorage",
"connectionConfigs": {
"projectId": "myProject"
}
}
}
}

Set the batch rate

By default, a new blob will be created every 500 ms or 5000 elements, whichever happens first. You can also optionally create a new blob once a certain number of serialized bytes has accumulated.

To override these:

  • use lingerMs to set the limit on time
  • use batchElements to set it on number of events
  • use batchBytes to set it on size
{
"connections": {
"gcs": {
"kind": "googleCloudStorage",
"connectionConfigs": {
"projectId": "myProject"
},
"batchConfigs": {
"lingerMs": 2000,
"batchElements": 10000,
"batchBytes": 5242880
}
}
}
}

Set blob content

Use bucket to set the bucket, bucketConfigs to set the blob format, and data to set the content.

format can be one of json, jsonl, or parquet. Additionally, setting pretty to true will cause json output to be pretty-printed.

Additionally, compression can be set to gzip.

{
"generators": [
{
"bucket": "sandbox",
"bucketConfigs": {
"blobPrefix": "foo-",
"format": "jsonl"
},
"data": {
"a": {
"_gen": "uuid"
},
"b": {
"_gen": "boolean"
}
}
}
],
"connections": {
"gcs": {
"kind": "googleCloudStorage",
"connectionConfigs": {
"projectId": "myProject"
}
}
}
}

Overriding the host

Set host to override the endpoint, perhaps to use fake-gcs-server for local testing.

{
"connections": {
"gcs": {
"kind": "googleCloudStorage",
"connectionConfigs": {
"projectId": "myProject",
"host": "https://0.0.0.0:4443/storage/v1/b"
}
}
}
}

Serializing with Parquet

Set format to parquet. By default, ShadowTraffic will attempt to guess your Parquet schema. But if it can't, specify it manually with avroSchemaHint.

{
"generators": [
{
"bucket": "sandbox",
"bucketConfigs": {
"format": "parquet",
"blobPrefix": "part-"
},
"data": {
"luckyNumber": {
"_gen": "oneOf",
"choices": [
1,
2,
3
]
}
},
"localConfigs": {
"avroSchemaHint": {
"data": {
"type": "record",
"name": "MyRecordName",
"fields": [
{
"name": "luckyNumber",
"type": "int"
}
]
}
}
}
}
],
"connections": {
"gcs": {
"kind": "googleCloudStorage",
"connectionConfigs": {
"projectId": "myProject"
}
}
}
}

Multiple lookups

If you have multiple generators writing to the same bucket and want to execute a lookup against only one of them, use the optional blobPrefix parameter in lookups against Google Cloud Storage. This narrows lookups to only generators that write against the specified blob prefix.

{
"generators": [
{
"bucket": "sandbox",
"bucketConfigs": {
"blobPrefix": "writer-a-",
"format": "jsonl"
},
"data": {
"specialNumber": {
"_gen": "uniformDistribution",
"bounds": [
1,
10
],
"decimals": 0
}
}
},
{
"bucket": "sandbox",
"bucketConfigs": {
"blobPrefix": "writer-b-",
"format": "jsonl"
},
"data": {
"verySpecialNumber": {
"_gen": "uniformDistribution",
"bounds": [
1,
20
],
"decimals": 0
}
}
},
{
"bucket": "sandbox",
"bucketConfigs": {
"blobPrefix": "writer-c-",
"format": "jsonl"
},
"data": {
"_gen": "lookup",
"bucket": "sandbox",
"blobPrefix": "writer-a-",
"path": [
"data"
]
}
}
],
"connections": {
"gcs": {
"kind": "googleCloudStorage",
"connectionConfigs": {
"projectId": "myProject"
}
}
}
}

Specification

Connection JSON schema

{
"type": "object",
"properties": {
"kind": {
"type": "string",
"const": "googleCloudStorage"
},
"batchConfigs": {
"type": "object",
"properties": {
"lingerMs": {
"type": "integer",
"minimum": 0
},
"batchElements": {
"type": "integer",
"minimum": 1
},
"batchBytes": {
"type": "integer",
"minimum": 1
}
}
},
"connectionConfigs": {
"type": "object",
"properties": {
"host": {
"type": "string"
},
"projectId": {
"type": "string"
}
},
"required": [
"projectId"
]
}
}
}

Generator JSON schema

{
"type": "object",
"properties": {
"connection": {
"type": "string"
},
"name": {
"type": "string"
},
"bucket": {
"type": "string"
},
"data": {
"type": "object"
},
"localConfigs": {
"type": "object",
"properties": {
"throttleMs": {
"oneOf": [
{
"type": "number",
"minimum": 0
},
{
"type": "object",
"properties": {
"_gen": {
"type": "string"
}
},
"required": [
"_gen"
]
}
]
},
"maxEvents": {
"oneOf": [
{
"type": "integer",
"minimum": 0
},
{
"type": "object",
"properties": {
"_gen": {
"type": "string"
}
},
"required": [
"_gen"
]
}
]
},
"kafkaKeyProtobufHint": {
"type": "object",
"properties": {
"schemaFile": {
"type": "string"
},
"message": {
"type": "string"
}
},
"required": [
"schemaFile",
"message"
]
},
"jsonSchemaHint": {
"type": "object"
},
"maxBytes": {
"type": "integer",
"minimum": 1
},
"discard": {
"type": "object",
"properties": {
"rate": {
"type": "number",
"minimum": 0,
"maximum": 1
}
},
"required": [
"rate"
]
},
"repeat": {
"type": "object",
"properties": {
"rate": {
"type": "number",
"minimum": 0,
"maximum": 1
},
"times": {
"oneOf": [
{
"type": "integer",
"minimum": 0
},
{
"type": "object",
"properties": {
"_gen": {
"type": "string"
}
},
"required": [
"_gen"
]
}
]
}
},
"required": [
"rate",
"times"
]
},
"protobufSchemaHint": {
"type": "object",
"patternProperties": {
"^.*$": {
"type": "object",
"properties": {
"schemaFile": {
"type": "string"
},
"message": {
"type": "string"
}
},
"required": [
"schemaFile",
"message"
]
}
}
},
"maxHistoryEvents": {
"type": "integer",
"minimum": 0
},
"maxMs": {
"type": "integer",
"minimum": 0
},
"time": {
"type": "integer"
},
"events": {
"type": "object",
"properties": {
"exactly": {
"oneOf": [
{
"type": "integer",
"minimum": 0
},
{
"type": "object",
"properties": {
"_gen": {
"type": "string"
}
},
"required": [
"_gen"
]
}
]
}
}
},
"delay": {
"type": "object",
"properties": {
"rate": {
"type": "number",
"minimum": 0,
"maximum": 1
},
"ms": {
"oneOf": [
{
"type": "integer",
"minimum": 0
},
{
"type": "object",
"properties": {
"_gen": {
"type": "string"
}
},
"required": [
"_gen"
]
}
]
}
},
"required": [
"rate",
"ms"
]
},
"history": {
"type": "object",
"properties": {
"events": {
"type": "object",
"properties": {
"max": {
"type": "integer",
"minimum": 0
}
}
}
}
},
"avroSchemaHint": {
"type": "object"
},
"throttle": {
"type": "object",
"properties": {
"ms": {
"oneOf": [
{
"type": "number",
"minimum": 0
},
{
"type": "object",
"properties": {
"_gen": {
"type": "string"
}
},
"required": [
"_gen"
]
}
]
}
}
},
"throughput": {
"oneOf": [
{
"type": "integer",
"minimum": 1
},
{
"type": "object",
"properties": {
"_gen": {
"type": "string"
}
},
"required": [
"_gen"
]
}
]
},
"timeMultiplier": {
"oneOf": [
{
"type": "number"
},
{
"type": "object",
"properties": {
"_gen": {
"type": "string"
}
},
"required": [
"_gen"
]
}
]
},
"kafkaValueProtobufHint": {
"type": "object",
"properties": {
"schemaFile": {
"type": "string"
},
"message": {
"type": "string"
}
},
"required": [
"schemaFile",
"message"
]
}
}
},
"bucketConfigs": {
"type": "object",
"properties": {
"blobPrefix": {
"oneOf": [
{
"type": "string"
},
{
"type": "object",
"properties": {
"_gen": {
"type": "string"
}
},
"required": [
"_gen"
]
}
]
},
"format": {
"type": "string",
"enum": [
"json",
"jsonl",
"parquet"
]
},
"pretty": {
"type": "boolean"
},
"compression": {
"type": "string",
"enum": [
"gzip"
]
}
},
"required": [
"blobPrefix",
"format"
]
}
},
"required": [
"bucket",
"data",
"bucketConfigs"
]
}