Android Map | Article Map
Terraform @ Scale - 第 5b 部分: API Gateways

Color logo   no background

    Terraform @ Scale - 第 5b 部分: API Gateways

    在上一篇文章 5a 中,我们看到大规模的 Terraform Rollouts 很快会触碰到 API 限制,例如当 DR 测试需要并行创建上百个资源时,429 错误会像雪崩一样触发大量 Retries。本篇续篇正是从这里切入,展示如何通过 Oracle Cloud Infrastructure 的 API Gateway 以及 Amazon API Gateway 来有意识地管理这些限制,实现干净的可观测性,并通过「Policy as Code」将其落实到稳定的运营实践中。

    API Gateway: 最后的武器?

    API-Gateways 帮助我们使 API 限制变得可控。正确使用时,它们能够汇聚 API 调用、强制实施配额和 Throttling、提供一致的可观测性数据,并在运营和治理上形成一个集中入口。

    对我们来说,最重要的是:一个 Gateway 并不只是把 Rate-Limit 问题转移到别处,而是让它能够在团队、部署和路由层面得到主动控制。

    Oracle Cloud Infrastructure 中,您可以通过 Usage Plans 和 Entitlements 设置技术护栏。这些规则直接作用于 API Gateway 部署,例如每秒的硬性速率限制,以及每分钟或每月的配额。为保证执行和透明性,服务还提供了如 HttpResponses 这样的专用指标,并带有 deploymentId 和 httpStatusCode 维度,可以干净地接入告警系统。(Oracle Documentation)。

    服务日志类别 access 和 execution 是该服务预设的通道;它们直接关联到 API 部署,相较于传统的 Bucket 日志归档,这是首选方式。(Oracle Documentation)

    以下是一个 OCI 示例(AWS 示例将在后文展示):


    # Terraform >= 1.10, OCI Provider >= 7.14.0
    terraform {
      required_version = ">= 1.10"
    
      required_providers {
        # The constraint operator belongs inside the version string;
        # `version >= "7.14.0"` is not valid HCL and fails at parse time.
        oci = {
          source  = "oracle/oci"
          version = ">= 7.14.0"
        }
      }
    }
    
    # The provider region is taken from the validated input variable below.
    provider "oci" {
      region = var.region
    }
    
    variable "region" {
      type        = string
      description = "OCI region, e.g., eu-frankfurt-1"
    
      validation {
        # Allow one or more middle segments so that region identifiers with an
        # extra segment (e.g. 'us-gov-ashburn-1') pass as well as 'eu-frankfurt-1'.
        condition     = can(regex("^[a-z]+(-[a-z0-9]+)+-[0-9]+$", var.region))
        error_message = "Region must match a pattern like 'eu-frankfurt-1'."
      }
    }
    
    variable "compartment_id" {
      type        = string
      description = "Compartment OCID used for gateway, logs, and alarms"
    }
    
    # Optional: Many organizations manage the API deployment separately.
    # We intentionally reference it via a variable to keep the example focused.
    variable "api_deployment_id" {
      type        = string
      description = "OCID of the API Gateway deployment"
    
      validation {
        # Only checks the 'ocid1.' prefix; it does not verify the resource type.
        condition     = can(regex("^ocid1\\..+", var.api_deployment_id))
        error_message = "api_deployment_id must be a valid OCID."
      }
    }
    
    # Enable service logs for 'access' and 'execution'.
    # Both logs live in one log group and point at the same API deployment.
    resource "oci_logging_log_group" "apigw" {
      compartment_id = var.compartment_id
      display_name   = "apigw-logs"
    }
    
    resource "oci_logging_log" "apigw_access" {
      log_group_id = oci_logging_log_group.apigw.id
      display_name = "apigateway-access"
      log_type     = "SERVICE"
      is_enabled   = true
    
      configuration {
        source {
          # source_type is a required argument of the source block;
          # service logs use "OCISERVICE".
          source_type = "OCISERVICE"
          service     = "apigateway"
          resource    = var.api_deployment_id
          category    = "access"
        }
      }
    }
    
    resource "oci_logging_log" "apigw_execution" {
      log_group_id = oci_logging_log_group.apigw.id
      display_name = "apigateway-execution"
      log_type     = "SERVICE"
      is_enabled   = true
    
      configuration {
        source {
          source_type = "OCISERVICE"
          service     = "apigateway"
          resource    = var.api_deployment_id
          category    = "execution"
        }
      }
    }
    
    # Usage plan: a per-second rate limit combined with a per-minute quota,
    # bound to the API deployment passed in via var.api_deployment_id.
    resource "oci_apigateway_usage_plan" "team_plan" {
      display_name   = "team-standard-plan"
      compartment_id = var.compartment_id
    
      # Terraform refuses to plan a destroy of this resource.
      lifecycle {
        prevent_destroy = true
      }
    
      entitlements {
        name        = "default"
        description = "Standard quota for CI runs"
    
        # Deployment this entitlement applies to
        targets {
          deployment_id = var.api_deployment_id
        }
    
        # 2000 calls per calendar minute; calls beyond that are rejected
        quota {
          value               = 2000
          unit                = "MINUTE"
          reset_policy        = "CALENDAR"
          operation_on_breach = "REJECT"
        }
    
        # 50 calls per second
        rate_limit {
          value = 50
          unit  = "SECOND"
        }
      }
    }

    在 Amazon API Gateway 中,您可以结合三种手段:Stage 与 Method Throttling、带有 API Keys 的 Usage Plans,以及基于速率的 AWS WAF 规则来实现 IP 聚合控制。CloudWatch 指标 4XXError 和 5XXError 能够在 Stage 层面提供一个稳健的早期预警系统。

    重要提示: AWS WAFv2 目前只能与 REST-API Stages 关联,无法应用于 HTTP APIs。(AWS Documentation, Terraform Registry)


    # Amazon API Gateway (REST) – stage throttling, usage plan, WAF
    terraform {
      required_version = ">= 1.10"
    
      required_providers {
        aws = {
          source  = "hashicorp/aws"
          version = ">= 5.0"
        }
      }
    }
    
    # Region comes from var.aws_region, which is expected to be declared
    # alongside this snippet.
    provider "aws" {
      region = var.aws_region
    }
    
    # Region the provider is currently configured for
    data "aws_region" "current" {}
    
    # REST API with a single mocked GET /status endpoint.
    resource "aws_api_gateway_rest_api" "tf_api" {
      name = "terraform-at-scale"
    }
    
    resource "aws_api_gateway_resource" "status" {
      rest_api_id = aws_api_gateway_rest_api.tf_api.id
      parent_id   = aws_api_gateway_rest_api.tf_api.root_resource_id
      path_part   = "status"
    }
    
    resource "aws_api_gateway_method" "get_status" {
      rest_api_id   = aws_api_gateway_rest_api.tf_api.id
      resource_id   = aws_api_gateway_resource.status.id
      http_method   = "GET"
      authorization = "NONE"
    
      # Usage-plan throttling and quotas are only enforced for methods that
      # require an API key; without this flag the plan would have no effect.
      api_key_required = true
    }
    
    resource "aws_api_gateway_integration" "get_status_mock" {
      rest_api_id = aws_api_gateway_rest_api.tf_api.id
      resource_id = aws_api_gateway_resource.status.id
      http_method = aws_api_gateway_method.get_status.http_method
      type        = "MOCK"
    
      # A MOCK integration reads the status code to return from the request template.
      request_templates = {
        "application/json" = jsonencode({ statusCode = 200 })
      }
    }
    
    # Method and integration responses so the mock answers 200 instead of
    # failing with an internal error.
    resource "aws_api_gateway_method_response" "get_status_200" {
      rest_api_id = aws_api_gateway_rest_api.tf_api.id
      resource_id = aws_api_gateway_resource.status.id
      http_method = aws_api_gateway_method.get_status.http_method
      status_code = "200"
    }
    
    resource "aws_api_gateway_integration_response" "get_status_200" {
      rest_api_id = aws_api_gateway_rest_api.tf_api.id
      resource_id = aws_api_gateway_resource.status.id
      http_method = aws_api_gateway_method.get_status.http_method
      status_code = aws_api_gateway_method_response.get_status_200.status_code
    
      depends_on = [aws_api_gateway_integration.get_status_mock]
    }
    
    resource "aws_api_gateway_deployment" "this" {
      rest_api_id = aws_api_gateway_rest_api.tf_api.id
    
      # Create a new deployment when one of the API building blocks is replaced;
      # otherwise the stage keeps serving the snapshot taken at first apply.
      triggers = {
        redeployment = sha1(jsonencode([
          aws_api_gateway_resource.status.id,
          aws_api_gateway_method.get_status.id,
          aws_api_gateway_integration.get_status_mock.id,
          aws_api_gateway_integration_response.get_status_200.id,
        ]))
      }
    
      # The stage must never point at a deleted deployment during replacement.
      lifecycle {
        create_before_destroy = true
      }
    
      depends_on = [
        aws_api_gateway_integration.get_status_mock,
        aws_api_gateway_integration_response.get_status_200,
      ]
    }
    
    resource "aws_api_gateway_stage" "prod" {
      rest_api_id   = aws_api_gateway_rest_api.tf_api.id
      deployment_id = aws_api_gateway_deployment.this.id
      stage_name    = "prod"
    }
    
    # Method settings are not a block of aws_api_gateway_stage; they are managed
    # through the dedicated aws_api_gateway_method_settings resource.
    # method_path "*/*" applies the settings to every resource and HTTP method
    # of the stage.
    resource "aws_api_gateway_method_settings" "prod_all" {
      rest_api_id = aws_api_gateway_rest_api.tf_api.id
      stage_name  = aws_api_gateway_stage.prod.stage_name
      method_path = "*/*"
    
      settings {
        metrics_enabled    = true
        data_trace_enabled = false
    
        # NOTE: execution logging needs a CloudWatch Logs role configured at
        # account level (aws_api_gateway_account) – confirm it exists.
        logging_level = "INFO"
    
        throttling_burst_limit = 100
        throttling_rate_limit  = 50
      }
    }
    
    # Usage plan for the 'prod' stage: throttle settings plus a daily quota.
    resource "aws_api_gateway_usage_plan" "plan" {
      name = "team-standard-plan"
    
      quota_settings {
        period = "DAY"
        limit  = 2000
      }
    
      throttle_settings {
        rate_limit  = 50
        burst_limit = 100
      }
    
      api_stages {
        stage  = aws_api_gateway_stage.prod.stage_name
        api_id = aws_api_gateway_rest_api.tf_api.id
      }
    }
    
    # API key used by CI runs.
    resource "aws_api_gateway_api_key" "ci_key" {
      # If 'value' is omitted, the service generates a secure key automatically.
      enabled = true
      name    = "ci-runs"
    }
    
    # Attach the CI key to the usage plan.
    resource "aws_api_gateway_usage_plan_key" "ci_key_bind" {
      usage_plan_id = aws_api_gateway_usage_plan.plan.id
      key_type      = "API_KEY"
      key_id        = aws_api_gateway_api_key.ci_key.id
    }
    
    # WAFv2 rate-based rule (REGIONAL) – only for REST API stages, not HTTP APIs
    resource "aws_wafv2_web_acl" "apigw_waf" {
      name        = "apigw-waf"
      description = "Rate limit per source IP"
      scope       = "REGIONAL"
    
      # Nested blocks must use the multi-line form; the one-line block syntax
      # only permits a single attribute, so `default_action { allow {} }` fails to parse.
      default_action {
        allow {}
      }
    
      rule {
        name     = "rate-limit"
        priority = 1
    
        action {
          block {}
        }
    
        # Block a source IP once it exceeds 500 requests within the
        # rate-based evaluation window.
        statement {
          rate_based_statement {
            limit              = 500
            aggregate_key_type = "IP"
          }
        }
    
        # Separate metric name so rule-level and ACL-level metrics stay distinguishable.
        visibility_config {
          cloudwatch_metrics_enabled = true
          metric_name                = "apigw-waf-rate-limit"
          sampled_requests_enabled   = true
        }
      }
    
      visibility_config {
        cloudwatch_metrics_enabled = true
        metric_name                = "apigw-waf"
        sampled_requests_enabled   = true
      }
    }
    
    resource "aws_wafv2_web_acl_association" "stage_assoc" {
      # Use the stage's exported ARN instead of assembling it by hand; this avoids
      # format mistakes and the region data source attribute deprecated in newer providers.
      resource_arn = aws_api_gateway_stage.prod.arn
      web_acl_arn  = aws_wafv2_web_acl.apigw_waf.arn
    }
    
    

    Stage 范围的 Throttling、Usage Plans 以及 WAF 关联是 AWS 端的核心构建模块。CloudWatch 还提供了包括 4XXError 在内的指标,并带有 ApiName 和 Stage 维度,这使得在每个 Stage 层面触发告警变得更加简单。(AWS Documentation)

    Testing 与验证 (Terraform 1.10+)

    为了实现快速且可重复的安全保障,推荐使用 Terraform 的原生 Testing-Framework。通过 Mock-Provider 封装外部依赖,并使用 Assertions 来检查项目特定规则,例如最大批处理大小或在限制过低时的行为。

    专业提示: 请有意识地编写简短且有表现力的测试,用于增强模块对错误配置的防护。(HashiCorp Developer)


    # tests/api_limits.tftest.hcl
    #
    # The empty `test {}` block was removed: it configured nothing and is not
    # accepted by Terraform 1.10, the minimum version this article targets.
    
    variables {
      # Global default variables for all runs in this test file
      max_batch_size = 50
    }
    
    # Example: a single run must never try to create more than 50 new resources.
    # The test framework does not expose the raw plan (there is no
    # `run.<name>.resource_changes`); assertions can only read variables,
    # resource attributes and outputs of the module under test.
    # NOTE(review): assumes the module publishes the number of resources it is
    # about to create as output "planned_batch_size" – adjust to your module.
    run "enforce_small_batches" {
      command = plan
    
      assert {
        condition     = output.planned_batch_size <= var.max_batch_size
        error_message = "Too many new resources in a single run – split the deployment into smaller batches."
      }
    }
    
    # Example: we expect a named check to fail.
    # expect_failures takes checkable objects (input variables, outputs, check
    # blocks, or resources/data sources carrying pre-/postconditions); individual
    # preconditions have no address of their own. To expect a failing precondition,
    # list the address of the resource that declares it instead.
    # NOTE(review): assumes the module declares `check "api_limits_reasonable"`.
    run "expect_precondition_failure" {
      command = plan
    
      expect_failures = [
        check.api_limits_reasonable,
      ]
    }

    实践中的提示:

    • 每个 assert 的 condition 必须是单个布尔表达式。
    • expect_failures 引用的是可检查对象(输入变量、输出、check 块,以及带有 Preconditions/Postconditions 的资源或数据源),而不是一般的类型错误。
    • Ephemeral 资源截至目前(Terraform 1.12.0)主要适用于临时 Token 和查询,但不能作为 Mocks 的通用替代方案。

    Monitoring + Alerting

    可观测性是您的 API 限制策略的运营支柱。

    在 OCI 上,最可靠的方式是直接使用 API Gateway 的服务指标,并结合监控平台的告警。维度 deploymentId 和 httpStatusCode 可用于唯一过滤 429 响应。MQL 语法如下,请注意维度名称的正确性:(Oracle Documentation)


    # OCI: notification channel for alarms on sustained HTTP 429 responses
    # at deployment level.
    resource "oci_ons_notification_topic" "ops" {
      name           = "ops-alerts"
      compartment_id = var.compartment_id
    }
    
    # E-mail subscription on the ops topic; the address comes from var.alert_email.
    resource "oci_ons_subscription" "ops_mail" {
      topic_id       = oci_ons_notification_topic.ops.id
      compartment_id = var.compartment_id
      endpoint       = var.alert_email
      protocol       = "EMAIL"
    }
    
    # Alarm that fires when the deployment returns more than five HTTP 429
    # responses per minute for at least one minute.
    resource "oci_monitoring_alarm" "apigw_429" {
      compartment_id        = var.compartment_id
      metric_compartment_id = var.compartment_id
      display_name          = "APIGW 429 bursts"
      is_enabled            = true
      severity              = "CRITICAL"
      destinations          = [oci_ons_notification_topic.ops.id]
      message_format        = "ONS_OPTIMIZED"
      pending_duration      = "PT1M" # 1 minute
      resolution            = "1m"
    
      # namespace is a required argument; API Gateway publishes its metrics
      # under "oci_apigateway".
      namespace = "oci_apigateway"
    
      # Correct dimensions according to API Gateway metrics: deploymentId, httpStatusCode.
      # Written as a single-line string so no heredoc indentation or trailing
      # newline ends up in the MQL expression.
      query = "HttpResponses[1m]{deploymentId=\"${var.api_deployment_id}\", httpStatusCode=\"429\"}.sum() > 5"
    
      body = "Increased rate of HTTP 429 on API Gateway deployment: {{triggerValue}}/min"
    }

    在 AWS 上,您可以定义简单且稳健的告警,针对 4XXError 和 5XXError,并辅以 Stage 范围的 Throttling。在实际运行中,基于 4XXError 的告警触发得更早更广,而 WAF 的速率限制则用于拦截突发的流量峰值。(AWS Documentation)


    # AWS: CloudWatch alarm on 4XX errors (stage-wide)
    resource "aws_cloudwatch_metric_alarm" "api_4xx_spike" {
      alarm_name        = "apigw-prod-4xx-spike"
      alarm_description = "Elevated client errors on 'prod' stage"
    
      # Metric: 4XXError of the REST API, scoped to the 'prod' stage
      namespace   = "AWS/ApiGateway"
      metric_name = "4XXError"
    
      dimensions = {
        Stage   = aws_api_gateway_stage.prod.stage_name
        ApiName = aws_api_gateway_rest_api.tf_api.name
      }
    
      # Evaluation: sum over one 60-second period must exceed 50
      statistic           = "Sum"
      period              = 60
      evaluation_periods  = 1
      comparison_operator = "GreaterThanThreshold"
      threshold           = 50
    }

    生产环境最佳实践

    规划先于优化

    API-Gateways 应当契合您的架构与运营模型,而不是迫使模型去适配 Gateway。以下实践已被验证有效,并且是基于本系列第 5a 篇文章的延伸:

    分层部署:将 Foundation、平台与应用工作负载分开,这样单个 Run 保持小规模,避免配额叠加超限。

    IaC 的 Circuit-Breaker:实现 Preconditions 与 Checks,一旦错误率上升就中止 Runs,从而避免消耗其他团队的配额。

    利用时间窗口:大规模 Rollouts 应该安排在主负载窗口之外。CI 时间表是运营手段,而不是装饰。

    Provider-Timeouts 与 Retries:仅在必要时延长 Timeouts,而不是全局放宽。对于 OCI 资源,您可以在资源级别设置时间限制,例如 Deployment:


    # Example: resource-level operation timeouts on an API Gateway deployment.
    resource "oci_apigateway_deployment" "depl" {
      # ... your configuration ...
      # Each operation (create / update / delete) gets its own 30-minute limit.
      timeouts {
        create = "30m"
        update = "30m"
        delete = "30m"
      }
    }
    

    有意识地控制并行度:在 Terraform Enterprise 中,请在 Workspace 层面设置 TFE_PARALLELISM,而不是在命令行处到处硬编码 -parallelism Flags。这样能够避免不可控的流量高峰,并且便于审计。

    Graceful Degradation:设计可选路径,在触发 Limits 时退化为更简单的运行模式,而不是让整个 Run 失败。

    文档化配额:每个 Provider 与 Service 的 Quotas 必须集中管理。只有清楚配额的人,才能有限度地进行部署。

    Policy as Code 与 Sentinel

    Policies 用于保护平台质量。以下 Sentinel-Policy 限制每个 Run 的最大新建资源数。它可作为 Must-Have Guardrail 集成在 Terraform Enterprise 中,并在高负载时生成有价值的警告,而不是直接报错失败。


    # sentinel/policies/api_limit_guard.sentinel
    #
    # Fails a run that would create more than max_resources_per_run resources
    # and prints a warning once the count exceeds 30.
    import "tfplan/v2" as tfplan
    
    max_resources_per_run = 50
    
    # All planned changes whose action list contains "create"
    resources_to_create = filter tfplan.resource_changes as _, rc {
      rc.change.actions contains "create"
    }
    
    # Advisory rule: a rule body is a single boolean expression, so the two
    # print() calls (each returns true) are chained with `and`. When the
    # `when` predicate is false the rule is skipped and evaluates to true.
    warn_high_resource_count = rule when length(resources_to_create) > 30 {
      print("WARNING: High resource volume detected.") and
      print("Consider reducing parallelism or splitting the deployment.")
    }
    
    # Rules are evaluated lazily: the warning rule only runs because main
    # references it. It never fails main, it only emits the messages.
    main = rule {
      warn_high_resource_count and
      length(resources_to_create) <= max_resources_per_run
    }

    与 Terraform Enterprise 的集成

    在流水线中,文章 5a 中讨论的许多措施才能真正发挥效果。

    Terraform Enterprise 允许您将并行度、运行时设置以及 Gateway-Client 配置编码为组织标准。对于位于欧盟、对数据主权有要求的客户而言,TFE 是(目前唯一的)首选方案。


    # Terraform and tfe provider version constraints
    terraform {
      required_version = ">= 1.10"
    
      required_providers {
        tfe = {
          source  = "hashicorp/tfe"
          version = ">= 0.65.0"
        }
      }
    }
    
    # Host name and API token are supplied through input variables.
    provider "tfe" {
      token    = var.tfe_token
      hostname = var.tfe_hostname # e.g., tfe.example.eu
    }
    
    # Production workspace, pinned to a fixed Terraform version and
    # rooted at live/prod.
    resource "tfe_workspace" "prod" {
      organization      = var.tfe_org
      name              = "production-infra"
      working_directory = "live/prod"
      terraform_version = "1.10.5"
    
      # Consider 'false' if your maturity model requires manual gates
      queue_all_runs = true
    }
    
    # Variable set holding the API limit controls.
    resource "tfe_variable_set" "api_limits" {
      name         = "api-limit-controls"
      description  = "Controls for parallelism and API client defaults"
      organization = var.tfe_org
    }
    
    # A variable set has no effect until it is attached to a workspace
    # (or declared global); bind it to the production workspace.
    resource "tfe_workspace_variable_set" "prod_api_limits" {
      variable_set_id = tfe_variable_set.api_limits.id
      workspace_id    = tfe_workspace.prod.id
    }
    
    # Control Terraform parallelism via TFE_PARALLELISM
    resource "tfe_variable" "parallelism" {
      key             = "TFE_PARALLELISM"
      value           = "5"
      category        = "env"
      description     = "Terraform parallelism for API limit control"
      variable_set_id = tfe_variable_set.api_limits.id
    }
    
    # Example of passing a client header for downstream API gateway policies.
    # The value is static on purpose: timestamp() is re-evaluated on every plan
    # and would make this variable show a change in each run.
    resource "tfe_variable" "client_header" {
      key             = "TF_VAR_apigw_client_header"
      value           = "X-CI-Client: terraform-enterprise"
      category        = "env"
      description     = "Example header for downstream API gateway policies"
      variable_set_id = tfe_variable_set.api_limits.id
    }

    通过 TFE_PARALLELISM 进行的控制是有文档支撑且经实践验证的。请保持保守的取值,并衡量其对 Plan 与 Apply 时间的影响。

    注意:盲目提高并行度往往会因更多的 429/5xx 响应而导致性能下降。

    结论:对 API 的尊重

    API-Limits 虽然常被视为障碍,但实际上它们是一种在您的代码与平台之间的运营契约。基于 Terraform 的方法,结合清晰的 Rate-Limits、Quotas 和 Gateway 层面的告警机制,可以让 CI-Pipelines 更具可预测性,保护跨团队的资源,并显著提高 Runs 的成功率。

    在第 5a 篇文章中讨论的措施依然是首选的抓手。额外引入 API-Gateways 则可以进一步增强控制力,统一 Observability,并集中固化您的规则。

    记住:尊重 Limits,才能部署得更可持续、更稳健。