Connections
googleCloudStorage
Commentary
added in 0.9.0
Connects to Google Cloud Storage.
You must set the environment variable GOOGLE_APPLICATION_CREDENTIALS
to the location of a credential configuration file within your container. You can set Docker environment variables with either -e
or --env-file
, similar to how the license environment variables are passed.
The target bucket must exist prior to writing data.
A new blob will be created following the default batch rate, which can be overridden by time, elements, or serialized bytes. 1
Blobs are created with the key name <blob-prefix>-<ulid>.<file-suffix>
, where ulid
is a monotonically increasing ULID. This means all blobs in the bucket are sortable by key name.
You can choose from a range of serialization formats and compression types 2.
This connection also works with GCS-compatible services like fake-gcs-server 3.
Examples
Configuring the connection
Set projectId
to configure where the data is written.
{
"connections": {
"gcs": {
"kind": "googleCloudStorage",
"connectionConfigs": {
"projectId": "myProject"
}
}
}
}
Set the batch rate
By default, a new blob will be created every 500
ms or 5000
elements, whichever happens first. You can also optionally create a new blob after a certain number of serialized bytes has accumulated.
To override these:
- use lingerMs to set the limit on time
- use batchElements to set the limit on the number of events
- use batchBytes to set the limit on the size in serialized bytes
{
"connections": {
"gcs": {
"kind": "googleCloudStorage",
"connectionConfigs": {
"projectId": "myProject"
},
"batchConfigs": {
"lingerMs": 2000,
"batchElements": 10000,
"batchBytes": 5242880
}
}
}
}
Set blob content
Use bucket
to set the bucket, bucketConfigs
to set the blob format, and data
to set the content.
format
can be any of json
, jsonl
, or parquet
. Setting pretty
to true
will cause json
output to be pretty-printed.
Additionally, compression
can be set to gzip
.
{
"generators": [
{
"bucket": "sandbox",
"bucketConfigs": {
"blobPrefix": "foo-",
"format": "jsonl"
},
"data": {
"a": {
"_gen": "uuid"
},
"b": {
"_gen": "boolean"
}
}
}
],
"connections": {
"gcs": {
"kind": "googleCloudStorage",
"connectionConfigs": {
"projectId": "myProject"
}
}
}
}
Overriding the host
Set host
to override the endpoint, perhaps to use fake-gcs-server for local testing.
{
"connections": {
"gcs": {
"kind": "googleCloudStorage",
"connectionConfigs": {
"projectId": "myProject",
"host": "https://0.0.0.0:4443/storage/v1/b"
}
}
}
}
Serializing with Parquet
Set format
to parquet
. By default, ShadowTraffic will attempt to guess your Parquet schema. But if it can't, specify it manually with avroSchemaHint
.
{
"generators": [
{
"bucket": "sandbox",
"bucketConfigs": {
"format": "parquet",
"blobPrefix": "part-"
},
"data": {
"luckyNumber": {
"_gen": "oneOf",
"choices": [
1,
2,
3
]
}
},
"localConfigs": {
"avroSchemaHint": {
"data": {
"type": "record",
"name": "MyRecordName",
"fields": [
{
"name": "luckyNumber",
"type": "int"
}
]
}
}
}
}
],
"connections": {
"gcs": {
"kind": "googleCloudStorage",
"connectionConfigs": {
"projectId": "myProject"
}
}
}
}
Multiple lookups
If you have multiple generators writing to the same bucket and want to execute a lookup
against only one of them, use the optional blobPrefix
parameter in lookups against Google Cloud Storage. This narrows lookups to only the generators that write against the specified blob prefix.
{
"generators": [
{
"bucket": "sandbox",
"bucketConfigs": {
"blobPrefix": "writer-a-",
"format": "jsonl"
},
"data": {
"specialNumber": {
"_gen": "uniformDistribution",
"bounds": [
1,
10
],
"decimals": 0
}
}
},
{
"bucket": "sandbox",
"bucketConfigs": {
"blobPrefix": "writer-b-",
"format": "jsonl"
},
"data": {
"verySpecialNumber": {
"_gen": "uniformDistribution",
"bounds": [
1,
20
],
"decimals": 0
}
}
},
{
"bucket": "sandbox",
"bucketConfigs": {
"blobPrefix": "writer-c-",
"format": "jsonl"
},
"data": {
"_gen": "lookup",
"bucket": "sandbox",
"blobPrefix": "writer-a-",
"path": [
"data"
]
}
}
],
"connections": {
"gcs": {
"kind": "googleCloudStorage",
"connectionConfigs": {
"projectId": "myProject"
}
}
}
}
Specification
Connection JSON schema
{
"type": "object",
"properties": {
"kind": {
"type": "string",
"const": "googleCloudStorage"
},
"batchConfigs": {
"type": "object",
"properties": {
"lingerMs": {
"type": "integer",
"minimum": 0
},
"batchElements": {
"type": "integer",
"minimum": 1
},
"batchBytes": {
"type": "integer",
"minimum": 1
}
}
},
"connectionConfigs": {
"type": "object",
"properties": {
"host": {
"type": "string"
},
"projectId": {
"type": "string"
}
},
"required": [
"projectId"
]
}
}
}
Generator JSON schema
{
"type": "object",
"properties": {
"connection": {
"type": "string"
},
"name": {
"type": "string"
},
"bucket": {
"type": "string"
},
"data": {
"type": "object"
},
"localConfigs": {
"type": "object",
"properties": {
"throttleMs": {
"oneOf": [
{
"type": "number",
"minimum": 0
},
{
"type": "object",
"properties": {
"_gen": {
"type": "string"
}
},
"required": [
"_gen"
]
}
]
},
"maxEvents": {
"oneOf": [
{
"type": "integer",
"minimum": 0
},
{
"type": "object",
"properties": {
"_gen": {
"type": "string"
}
},
"required": [
"_gen"
]
}
]
},
"kafkaKeyProtobufHint": {
"type": "object",
"properties": {
"schemaFile": {
"type": "string"
},
"message": {
"type": "string"
}
},
"required": [
"schemaFile",
"message"
]
},
"jsonSchemaHint": {
"type": "object"
},
"maxBytes": {
"type": "integer",
"minimum": 1
},
"discard": {
"type": "object",
"properties": {
"rate": {
"type": "number",
"minimum": 0,
"maximum": 1
}
},
"required": [
"rate"
]
},
"repeat": {
"type": "object",
"properties": {
"rate": {
"type": "number",
"minimum": 0,
"maximum": 1
},
"times": {
"oneOf": [
{
"type": "integer",
"minimum": 0
},
{
"type": "object",
"properties": {
"_gen": {
"type": "string"
}
},
"required": [
"_gen"
]
}
]
}
},
"required": [
"rate",
"times"
]
},
"protobufSchemaHint": {
"type": "object",
"patternProperties": {
"^.*$": {
"type": "object",
"properties": {
"schemaFile": {
"type": "string"
},
"message": {
"type": "string"
}
},
"required": [
"schemaFile",
"message"
]
}
}
},
"maxHistoryEvents": {
"type": "integer",
"minimum": 0
},
"maxMs": {
"type": "integer",
"minimum": 0
},
"time": {
"type": "integer"
},
"events": {
"type": "object",
"properties": {
"exactly": {
"oneOf": [
{
"type": "integer",
"minimum": 0
},
{
"type": "object",
"properties": {
"_gen": {
"type": "string"
}
},
"required": [
"_gen"
]
}
]
}
}
},
"delay": {
"type": "object",
"properties": {
"rate": {
"type": "number",
"minimum": 0,
"maximum": 1
},
"ms": {
"oneOf": [
{
"type": "integer",
"minimum": 0
},
{
"type": "object",
"properties": {
"_gen": {
"type": "string"
}
},
"required": [
"_gen"
]
}
]
}
},
"required": [
"rate",
"ms"
]
},
"history": {
"type": "object",
"properties": {
"events": {
"type": "object",
"properties": {
"max": {
"type": "integer",
"minimum": 0
}
}
}
}
},
"avroSchemaHint": {
"type": "object"
},
"throttle": {
"type": "object",
"properties": {
"ms": {
"oneOf": [
{
"type": "number",
"minimum": 0
},
{
"type": "object",
"properties": {
"_gen": {
"type": "string"
}
},
"required": [
"_gen"
]
}
]
}
}
},
"throughput": {
"oneOf": [
{
"type": "integer",
"minimum": 1
},
{
"type": "object",
"properties": {
"_gen": {
"type": "string"
}
},
"required": [
"_gen"
]
}
]
},
"timeMultiplier": {
"oneOf": [
{
"type": "number"
},
{
"type": "object",
"properties": {
"_gen": {
"type": "string"
}
},
"required": [
"_gen"
]
}
]
},
"kafkaValueProtobufHint": {
"type": "object",
"properties": {
"schemaFile": {
"type": "string"
},
"message": {
"type": "string"
}
},
"required": [
"schemaFile",
"message"
]
}
}
},
"bucketConfigs": {
"type": "object",
"properties": {
"blobPrefix": {
"oneOf": [
{
"type": "string"
},
{
"type": "object",
"properties": {
"_gen": {
"type": "string"
}
},
"required": [
"_gen"
]
}
]
},
"format": {
"type": "string",
"enum": [
"json",
"jsonl",
"parquet"
]
},
"pretty": {
"type": "boolean"
},
"compression": {
"type": "string",
"enum": [
"gzip"
]
}
},
"required": [
"blobPrefix",
"format"
]
}
},
"required": [
"bucket",
"data",
"bucketConfigs"
]
}