Terraform Data Pipelines

July 23, 2024

At Canvas we move a lot of data, and one of the busiest highways runs from S3 to Snowflake. S3 is where we stage our data: output from ETL pipelines, webhook payloads, even files that customers upload directly.

Some of our customers' queries we can serve directly from S3, but for larger datasets we move the data into Snowflake and serve queries from there.

Fortunately, Snowflake has excellent support for loading data from S3 via stages. Unfortunately, using it means wading through roughly 2,000 words of steps spread across the AWS and Snowflake UIs.

I think this is a problem with Big Data broadly: most of the industry has not adopted modern software engineering best practices, including infrastructure as code (IaC). IaC is a far more repeatable and auditable way to deploy infrastructure than clicking through UI flows.

To their credit, big players like Snowflake and Fivetran offer Terraform providers; there's just not much usage or discussion around them. In that spirit, here's a recipe we've used dozens of times to stage data from S3 to Snowflake.

variable "aws_region" {
  description = "AWS region"
  type        = string
}

variable "aws_profile" {
  description = "AWS profile"
  type        = string
}

variable "snowflake_account" {
  description = "Snowflake account identifier"
  type        = string
}

variable "snowflake_user" {
  description = "Snowflake user"
  type        = string
}

variable "snowflake_password" {
  description = "Snowflake password"
  type        = string
  sensitive   = true
}

variable "s3_bucket_name" {
  description = "Name to give the S3 bucket"
  type        = string
}

variable "snowflake_database" {
  description = "Snowflake database where the stage will be created"
  type        = string
}

variable "snowflake_schema" {
  description = "Snowflake schema where the stage will be created"
  type        = string
}

variable "aws_iam_user_arn" {
  description = "Initially set this to your ARN as a placeholder. After step 3, update with the output from snowflake_s3_integration_storage_aws_iam_user_arn"
  type        = string
}

variable "snowflake_external_id" {
  description = "Snowflake External ID for AWS IAM Role. Can initially be set to any placeholder value. After step 3, update with the output from snowflake_s3_integration_storage_aws_external_id"
  type        = string
}

# Look up the current AWS account ID, used to build the role ARN below
data "aws_caller_identity" "current" {}

# Workaround for the cyclic dependency between the IAM role and the storage integration
# https://stackoverflow.com/questions/68311357/how-to-create-a-snowflake-storage-integration-with-aws-s3-with-terraform
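# The storage integration needs the role's ARN, and the role's trust policy needs
# the integration's IAM user ARN and external ID. Precomputing the role ARN as a
# plain string lets Terraform create the integration before the role exists.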
locals {
  role_name = "snowflake_s3_role"
  role_arn  = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/${local.role_name}"
}

terraform {
  required_providers {
    snowflake = {
      source  = "Snowflake-Labs/snowflake"
      version = "~> 0.96"
    }
  }
}

provider "aws" {
  region  = var.aws_region # can also set from env var AWS_REGION
  profile = var.aws_profile # can also set from env var AWS_PROFILE
}

provider "snowflake" {
  account  = var.snowflake_account # can also set from env var SNOWFLAKE_ACCOUNT
  user     = var.snowflake_user # can also set from env var SNOWFLAKE_USER
  password = var.snowflake_password # can also set from env var SNOWFLAKE_PASSWORD
}

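# The bucket Snowflake will read from. If the bucket already exists, import it
# into state or replace this resource with an aws_s3_bucket data source.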
resource "aws_s3_bucket" "your_bucket_identifier" {
  bucket = var.s3_bucket_name
}

# Step 1: Configure access permissions for the S3 bucket
# If you only want Snowflake to access a specific prefix, adjust accordingly

resource "aws_iam_policy" "snowflake_s3_policy" {
  name        = "snowflake_s3_policy"
  description = "Policy for accessing specific S3 bucket and prefix"

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Action = [
          "s3:GetObject",
          "s3:GetObjectVersion"
        ]
        Resource = "${aws_s3_bucket.your_bucket_identifier.arn}/*"
      },
      {
        Effect = "Allow"
        Action = [
          "s3:ListBucket",
          "s3:GetBucketLocation"
        ]
        Resource = "${aws_s3_bucket.your_bucket_identifier.arn}"
      }
    ]
  })
}

# Step 2: Create the IAM Role in AWS

resource "aws_iam_role" "snowflake_s3_role" {
  name = local.role_name

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid    = "GrantSnowflakeAccess"
        Effect = "Allow"
        Principal = {
          AWS = snowflake_storage_integration.snowflake_s3_integration.storage_aws_iam_user_arn
        }
        Action = "sts:AssumeRole"
        Condition = {
          StringEquals = {
            "sts:ExternalId" = snowflake_storage_integration.snowflake_s3_integration.storage_aws_external_id
          }
        }
      }
    ]
  })
}

resource "aws_iam_role_policy_attachment" "snowflake_s3_policy_attachment" {
  role       = aws_iam_role.snowflake_s3_role.name
  policy_arn = aws_iam_policy.snowflake_s3_policy.arn
}

# Step 3: Create a Cloud Storage Integration in Snowflake
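# Note: creating a storage integration requires ACCOUNTADMIN or a role with the
# global CREATE INTEGRATION privilege, so make sure the Terraform user has it.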

resource "snowflake_storage_integration" "snowflake_s3_integration" {
  name    = "snowflake_s3_integration"
  type    = "EXTERNAL_STAGE"
  enabled = true
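  # "*" lets the integration reference any location; you can restrict it,
  # e.g. ["s3://${var.s3_bucket_name}/"], to limit blast radius.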
  storage_allowed_locations = ["*"]
  storage_provider         = "S3"
  storage_aws_role_arn     = local.role_arn
}

resource "snowflake_stage" "snowflake_s3_stage" {
  name     = "SNOWFLAKE_S3_STAGE"
  database = var.snowflake_database
  schema   = var.snowflake_schema
  url = "s3://${aws_s3_bucket.your_bucket_identifier.bucket}/"
  storage_integration = snowflake_storage_integration.snowflake_s3_integration.name
  file_format = "type = parquet" // adjust to your file format
}

# Step 4: Retrieve the AWS IAM User for your Snowflake Account
# Update snowflake_s3_role with the outputs below

output "snowflake_s3_integration_storage_aws_iam_user_arn" {
  value = snowflake_storage_integration.snowflake_s3_integration.storage_aws_iam_user_arn
}
output "snowflake_s3_integration_storage_aws_external_id" {
  value = snowflake_storage_integration.snowflake_s3_integration.storage_aws_external_id
}