openapi: 3.0.3
# API metadata. The `description` is Markdown and states the integration-wide
# rules (capability tiers, polling model, units, region-token naming).
info:
  title: Spheron AI Marketplace — Provider Orchestrator API
  version: 1.0.0
  description: |
    # Overview

    This specification defines the API contract a compute provider's orchestrator
    must expose for the Spheron AI Marketplace to list, provision, and manage GPU
    instances on the provider's infrastructure.

    The marketplace acts as the **client** of this API. The provider implements
    the **server** side described here.

    ## Capability tiers

    | Tier | Endpoints | Required? |
    |------|-----------|-----------|
    | **Compute** | Offers, availability, instance lifecycle (create / get / list / start / stop / terminate) | **Required** — this is the minimum contract for a successful integration |
    | **Storage** | Volume locations, volume lifecycle (create / attach / detach / delete) | **Optional** — implement only if the provider supports network volumes |

    ## Integration model

    The marketplace is **poll-based**. No webhooks or callbacks are required.
    After provisioning, the marketplace polls `GET /instances/{instanceId}` every
    10–30 seconds and reacts to status transitions. Status values must therefore
    be accurate and promptly updated.

    ## Conventions

    - All requests and responses are JSON (`application/json`).
    - All prices are **USD per hour**.
    - Memory, storage, and GPU memory are expressed in **GB**.
    - Timestamps are RFC 3339 / ISO 8601 UTC strings.
    - Region identifiers are stable, human-readable tokens (e.g. `"Region 1"`,
      `"EU-North 1"`) and must be consistent across every endpoint.

  # Contact for integration questions.
  contact:
    name: Spheron AI Marketplace Integrations
    email: mitrasish@spheron.network

# Base URL template; `orchestrator-host` is substituted per provider.
servers:
  - description: Provider orchestrator base URL (provider-hosted)
    url: 'https://{orchestrator-host}/v1'
    variables:
      orchestrator-host:
        default: api.provider.example.com

# Bearer authentication applies to every operation unless an operation
# overrides it with its own `security` entry.
security:
  - bearerAuth: []

# Tag groups referenced by the operations under `paths`.
tags:
  - name: Authentication
    description: 'Token issuance (only needed if static API keys are not used)'
  - name: Offers
    description: 'Catalog of instance configurations and real-time availability'
  - name: Instances
    description: 'Instance provisioning and lifecycle management (required)'
  - name: Volumes
    description: 'Network volume management (optional capability tier)'

paths:
  # Optional token-exchange endpoint. `security: []` overrides the global
  # bearer requirement, so this operation is callable without a token.
  /auth/token:
    post:
      tags: [Authentication]
      operationId: issueToken
      summary: Exchange credentials for an access token (optional)
      description: |
        **Optional.** The preferred authentication mechanism is a long-lived
        static API key sent as a Bearer token on every request. Implement this
        endpoint only if the provider requires short-lived tokens. If
        implemented, the response must include `expires_in` so the marketplace
        can refresh proactively.
      security: []
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              # No credential field is marked required; which combination is
              # expected (`api_key` vs. `username`/`password`) is not fixed by
              # this schema.
              properties:
                api_key:
                  type: string
                  description: Static credential issued to the marketplace.
                username:
                  type: string
                password:
                  type: string
                  format: password
      responses:
        "200":
          description: Token issued.
          content:
            application/json:
              schema:
                type: object
                # `expires_in` is required to match the operation description,
                # which mandates it for proactive token refresh.
                required: [access_token, expires_in]
                properties:
                  access_token:
                    type: string
                  token_type:
                    type: string
                    example: Bearer
                  expires_in:
                    type: integer
                    description: Token lifetime in seconds.
                    example: 3600
        "401":
          $ref: "#/components/responses/Unauthorized"

  # Catalog endpoint: returns every offer, whether or not it is in stock.
  /configurations:
    get:
      operationId: listConfigurations
      tags:
        - Offers
      summary: List all instance configurations (offers)
      description: |
        Returns **every** offer the provider knows about — both currently
        deployable and currently out of stock — with the `available` flag set
        per offer.

        **Availability contract (important):**
        - `available: false` is the canonical signal that an offer exists but
          cannot be deployed right now (spot pool sold out, capacity below
          threshold, region temporarily drained).
        - Providers **must not** express unavailability by dropping the offer
          from the array. Dropped offers silently disappear from the
          marketplace UI instead of being shown as out-of-stock with a
          notify-me option.
        - This endpoint is for catalog display. `GET /availability` is the
          deploy-time gate.
      parameters:
        - in: query
          name: instance_type
          required: false
          description: Optional filter by instance type (e.g. only SPOT offers).
          schema:
            $ref: '#/components/schemas/InstanceType'
      responses:
        '200':
          description: Full offer catalog.
          content:
            application/json:
              schema:
                type: object
                required:
                  - configurations
                properties:
                  configurations:
                    type: array
                    items:
                      $ref: '#/components/schemas/Configuration'
        '401':
          $ref: '#/components/responses/Unauthorized'

  # Deploy-time gate: only offers that are deployable at call time.
  /availability:
    get:
      operationId: getAvailability
      tags:
        - Offers
      summary: List currently deployable configurations (deploy-time gate)
      description: |
        Returns only offers that can be deployed **right now**
        (`available: true`). The marketplace calls this immediately before
        provisioning to avoid routing deployments into out-of-stock capacity.
        Results must reflect real-time inventory as closely as possible.
      parameters:
        - in: query
          name: region
          required: false
          description: Restrict results to a single region token.
          schema:
            type: string
          example: 'EU-North 1'
      responses:
        '200':
          description: Deployable offers.
          content:
            application/json:
              schema:
                type: object
                required:
                  - configurations
                properties:
                  configurations:
                    type: array
                    items:
                      $ref: '#/components/schemas/Configuration'
        '401':
          $ref: '#/components/responses/Unauthorized'

  # Collection endpoint: list existing instances (GET) or provision one (POST).
  /instances:
    get:
      operationId: listInstances
      tags:
        - Instances
      summary: List instances owned by the marketplace account
      parameters:
        - in: query
          name: region
          required: false
          description: Optional region filter.
          schema:
            type: string
      responses:
        '200':
          description: Instances list.
          content:
            application/json:
              schema:
                type: object
                required:
                  - instances
                properties:
                  instances:
                    type: array
                    items:
                      $ref: '#/components/schemas/Instance'
        '401':
          $ref: '#/components/responses/Unauthorized'
    post:
      operationId: createInstance
      tags:
        - Instances
      summary: Provision a new instance
      description: |
        Provisions an instance of the requested configuration in the requested
        region. The call should return as soon as the instance record exists
        (status `PROVISIONING`); the marketplace then polls
        `GET /instances/{instanceId}` until the instance reaches `ONLINE` or a
        failure state.

        **Provisioning time expectations:**
        - Virtual machines: `ONLINE` with a reachable SSH endpoint within
          **10 minutes** (typical target: under 5).
        - Bare metal: longer windows are acceptable if status is reported
          accurately throughout; agree on an SLA during onboarding.

        **SSH keys:** `ssh_keys` contains raw public key material. The
        orchestrator must inject these keys into the instance so the end user
        can connect. If the provider platform requires registered key objects,
        the orchestrator should create temporary keys internally and clean
        them up when the instance is terminated.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CreateInstanceRequest'
      responses:
        '201':
          description: Instance record created; provisioning has started.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Instance'
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'
        # Capacity race: the offer sold out before the create call landed.
        '409':
          description: Requested capacity is no longer available.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Error'

  # Single-instance resource: polled state (GET) and termination (DELETE).
  /instances/{instanceId}:
    # Path-level parameters are shared by both operations below.
    parameters:
      - $ref: '#/components/parameters/instanceId'
      - $ref: '#/components/parameters/regionQuery'
    get:
      operationId: getInstance
      tags:
        - Instances
      summary: Get instance state (polled every 10–30 seconds)
      description: |
        Primary monitoring endpoint. The marketplace polls this continuously
        for every active instance, so it must be fast, cheap, and accurate.

        **Termination convention:** once an instance has been destroyed and no
        longer exists, this endpoint should return `404`. The marketplace
        treats `404` on a previously known instance as status `DESTROYED`.
      responses:
        '200':
          description: Current instance state.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Instance'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '404':
          $ref: '#/components/responses/NotFound'
    delete:
      operationId: destroyInstance
      tags:
        - Instances
      summary: Terminate an instance
      description: |
        Permanently destroys the instance and releases its resources,
        including any temporary provider-side resources created during
        provisioning (registered SSH keys, startup scripts). Idempotent:
        deleting an already-destroyed instance returns `404` or a terminal
        `DESTROYED` state — both are treated as success.
      responses:
        '200':
          description: Destruction initiated or completed.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Instance'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '404':
          $ref: '#/components/responses/NotFound'

  # Resume action for a stopped instance.
  /instances/{instanceId}/start:
    # Declared at path level; the path has a single operation, so this is
    # equivalent to listing the parameters on the operation itself.
    parameters:
      - $ref: '#/components/parameters/instanceId'
      - $ref: '#/components/parameters/regionQuery'
    post:
      operationId: startInstance
      tags:
        - Instances
      summary: Start (resume) a stopped instance
      description: |
        Resumes a `STOPPED` instance in place, preserving its disk and,
        where the platform allows, its IP address. If the provider cannot
        resume stopped instances, return `409` with a descriptive error.
      responses:
        '200':
          description: Start initiated.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Instance'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '404':
          $ref: '#/components/responses/NotFound'
        '409':
          description: Instance is not in a startable state.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Error'

  # Pause action for a running instance.
  /instances/{instanceId}/stop:
    # Declared at path level; the path has a single operation, so this is
    # equivalent to listing the parameters on the operation itself.
    parameters:
      - $ref: '#/components/parameters/instanceId'
      - $ref: '#/components/parameters/regionQuery'
    post:
      operationId: stopInstance
      tags:
        - Instances
      summary: Stop (pause) a running instance
      description: |
        Stops a running instance while preserving its disk. The instance
        remains owned by the deployment and should be resumable via
        `/start`. Billing for compute should pause while stopped (storage
        charges, if any, should be documented during onboarding).
      responses:
        '200':
          description: Stop initiated.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Instance'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '404':
          $ref: '#/components/responses/NotFound'
        '409':
          description: Instance is not in a stoppable state.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Error'

  # Storage tier: regions in which volumes can be created.
  /volumes/locations:
    get:
      operationId: listVolumeLocations
      tags:
        - Volumes
      summary: List regions where volumes can be created (optional tier)
      description: |
        **Optional capability tier.** Returns the regions in which network
        volumes can be created. Each `id` **must equal** the region token used
        in `Configuration.regions` for the same region, so the marketplace can
        match volumes to deployable capacity.
      responses:
        '200':
          description: Volume locations.
          content:
            application/json:
              schema:
                type: object
                required:
                  - locations
                properties:
                  locations:
                    type: array
                    items:
                      $ref: '#/components/schemas/VolumeLocation'
        '401':
          $ref: '#/components/responses/Unauthorized'

  # Storage tier: volume creation.
  /volumes:
    post:
      operationId: createVolume
      tags:
        - Volumes
      summary: Create a network volume (optional tier)
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CreateVolumeRequest'
      responses:
        '201':
          description: Volume created.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Volume'
        '400':
          $ref: '#/components/responses/BadRequest'
        '401':
          $ref: '#/components/responses/Unauthorized'

  # Storage tier: single-volume state (GET) and deletion (DELETE).
  /volumes/{volumeId}:
    # Path-level parameter shared by both operations below.
    parameters:
      - $ref: '#/components/parameters/volumeId'
    get:
      operationId: getVolume
      tags:
        - Volumes
      summary: Get volume state (optional tier)
      responses:
        '200':
          description: Volume state.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Volume'
        '401':
          $ref: '#/components/responses/Unauthorized'
        '404':
          $ref: '#/components/responses/NotFound'
    delete:
      operationId: deleteVolume
      tags:
        - Volumes
      summary: Delete a volume (optional tier)
      description: |
        Deletes the volume. If attachment records can linger after detach,
        the orchestrator should force-detach internally before deletion
        rather than failing. Deleting an already-deleted volume returns
        `404`, which the marketplace treats as success.
      responses:
        # Success carries no body.
        '204':
          description: Volume deleted.
        '401':
          $ref: '#/components/responses/Unauthorized'
        '404':
          $ref: '#/components/responses/NotFound'
        '409':
          description: Volume is attached and cannot be deleted.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Error'

  # Storage tier: attach a volume to one or more instances.
  /volumes/{volumeId}/attach:
    post:
      tags: [Volumes]
      operationId: attachVolume
      summary: Attach a volume to one or more instances (optional tier)
      description: |
        Attaching to an already-attached instance should be treated as
        success (idempotent). If the platform requires a stop → attach →
        start cycle, the orchestrator must handle it internally and report
        intermediate instance states accurately.
      parameters:
        - $ref: "#/components/parameters/volumeId"
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              required: [instance_ids]
              properties:
                instance_ids:
                  type: array
                  items:
                    type: string
                  # At least one target instance must be named.
                  minItems: 1
      responses:
        "200":
          description: Attachment initiated or completed.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Volume"
        # The body is validated (required `instance_ids`, `minItems: 1`), so a
        # rejection needs a declared response, as on the other body-taking
        # operations (`createInstance`, `createVolume`).
        "400":
          $ref: "#/components/responses/BadRequest"
        "401":
          $ref: "#/components/responses/Unauthorized"
        "404":
          $ref: "#/components/responses/NotFound"

  # Storage tier: detach a volume from some or all instances.
  /volumes/{volumeId}/detach:
    post:
      tags: [Volumes]
      operationId: detachVolume
      summary: Detach a volume from instances (optional tier)
      description: |
        Detaches the volume from the listed instances (or all instances if
        `instance_ids` is omitted). If detachment is asynchronous on the
        platform, the orchestrator should not report the volume as detached
        until the attachment record is actually gone.
      parameters:
        - $ref: "#/components/parameters/volumeId"
      requestBody:
        # The body is optional: omitting it requests a detach from all
        # instances, per the description above.
        required: false
        content:
          application/json:
            schema:
              type: object
              properties:
                instance_ids:
                  type: array
                  items:
                    type: string
      responses:
        "200":
          description: Detachment initiated or completed.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Volume"
        # A supplied body can still be malformed (e.g. `instance_ids` is not an
        # array of strings); declare the rejection as the other body-taking
        # operations do.
        "400":
          $ref: "#/components/responses/BadRequest"
        "401":
          $ref: "#/components/responses/Unauthorized"
        "404":
          $ref: "#/components/responses/NotFound"

components:
  # The only security scheme; the top-level `security` block applies it to
  # every operation, and `/auth/token` opts out with `security: []`.
  securitySchemes:
    bearerAuth:
      # HTTP `Authorization: Bearer …` header, per the description below.
      type: http
      scheme: bearer
      description: |
        Static API key (or token from `/auth/token`) sent as
        `Authorization: Bearer <key>`. Keys must be scoped to the marketplace
        account and revocable. Multiple active keys should be supported for
        zero-downtime rotation.

  # Reusable parameters referenced from the instance and volume paths.
  parameters:
    instanceId:
      in: path
      name: instanceId
      required: true
      description: Provider-assigned instance identifier.
      schema:
        type: string
    volumeId:
      in: path
      name: volumeId
      required: true
      description: Provider-assigned volume identifier.
      schema:
        type: string
    # Optional for the server even though the client always supplies it.
    regionQuery:
      in: query
      name: region
      required: false
      description: |
        Region token of the instance. The marketplace always sends this;
        orchestrators with globally unique instance IDs may ignore it.
      schema:
        type: string

  # Reusable error responses; all share the `Error` envelope.
  responses:
    Unauthorized:
      description: Missing or invalid credentials.
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/Error'
    BadRequest:
      description: Malformed or invalid request.
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/Error'
    NotFound:
      description: |
        Resource does not exist. For instances, the marketplace interprets
        404 on a previously known instance as terminal (`DESTROYED`).
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/Error'

  schemas:
    # Used by `Configuration.instance_type` and the `/configurations` filter.
    InstanceType:
      description: Commercial class of the offer.
      type: string
      enum:
        - SPOT
        - DEDICATED
        - BARE_METAL
        - CLUSTER

    # Status vocabulary reported in `Instance.status`.
    InstanceStatus:
      type: string
      description: |
        Canonical instance lifecycle status. Map internal platform states to
        these values:

        - `PROVISIONING` — being created; not yet usable.
        - `ONLINE` — running and reachable; SSH endpoint live.
        - `STOPPING` / `STOPPED` — paused; resource still exists and is owned
          by the deployment, resumable via `/start`.
        - `OFFLINE` — no longer running and **cannot** be resumed in place
          (e.g. the platform has reaped a shut-off/hibernated VM). Terminal.
        - `DESTROYING` / `DESTROYED` — being / has been deleted. Terminal.
        - `ERROR` — non-recoverable provisioning or runtime error. Terminal.
      enum: [PROVISIONING, ONLINE, STOPPING, STOPPED, OFFLINE, DESTROYING, DESTROYED, ERROR]

    # Offer record returned by `/configurations` and `/availability`.
    Configuration:
      type: object
      description: A purchasable instance configuration (offer / SKU).
      required:
        - id
        - name
        - vcpus
        - memory_gb
        - storage_gb
        - price_per_hour
        - available
        - regions
        - instance_type
        - supports_cloud_init
      properties:
        # --- Identity ---
        id:
          description: Stable unique identifier for this offer.
          type: string
          example: gpu-8x-h100-sxm
        name:
          description: Human-readable offer name.
          type: string
          example: 8× H100 SXM
        description:
          type: string
          example: 8× H100 SXM5 80GB, NVLink, 192 vCPU, 2TB RAM
        instance_type:
          $ref: '#/components/schemas/InstanceType'
        # --- Hardware shape ---
        vcpus:
          type: integer
          example: 192
        memory_gb:
          description: System memory in GB.
          type: number
          example: 2048
        storage_gb:
          description: Included local/root storage in GB.
          type: number
          example: 8000
        gpu_count:
          type: integer
          example: 8
        gpu_type:
          description: GPU model identifier, consistent across offers.
          type: string
          example: H100-SXM5-80GB
        gpu_memory_gb:
          description: Memory per GPU in GB.
          type: number
          example: 80
        # --- Pricing ---
        price_per_hour:
          description: On-demand price in USD per hour.
          type: number
          example: 21.52
        spot_price_per_hour:
          description: Spot price in USD per hour, if the offer supports spot.
          type: number
          example: 10.76
        # --- Availability and placement ---
        available:
          type: boolean
          description: |
            Whether this offer is deployable right now. Out-of-stock offers
            MUST still be returned from `/configurations` with
            `available: false` — never dropped from the array.
        regions:
          type: array
          description: |
            Canonical region tokens where this offer exists. Each token must
            be a stable, human-readable identifier (e.g. `"EU-North 1"`) used
            consistently across `/configurations`, `/availability`,
            `/instances`, and `/volumes/locations`. Raw internal datacenter
            codes should be translated to these tokens at the API boundary.
          items:
            type: string
          example:
            - 'EU-North 1'
            - 'US-Central 1'
        # --- Software and provisioning options ---
        os_options:
          description: Operating system images deployable on this offer.
          type: array
          items:
            type: string
          example:
            - 'ubuntu-22.04-cuda-12.4'
            - 'ubuntu-24.04'
        supports_cloud_init:
          description: Whether `cloud_init` is honored at instance creation.
          type: boolean
        maintenance:
          description: Offer/region temporarily under maintenance.
          type: boolean
        # Free-form bag; any nested shape is accepted.
        extras:
          description: Provider-specific metadata (e.g. add-on pricing).
          type: object
          additionalProperties: true

    # Request body of `POST /instances`.
    CreateInstanceRequest:
      type: object
      required: [configuration_id, region, ssh_keys]
      properties:
        configuration_id:
          description: '`Configuration.id` of the offer to deploy.'
          type: string
          example: gpu-8x-h100-sxm
        region:
          description: Canonical region token to deploy into.
          type: string
          example: 'EU-North 1'
        name:
          type: string
          description: |
            Desired instance name. If names must be unique on the platform,
            the orchestrator should de-duplicate (e.g. suffix) rather than
            fail.
          example: spheron-d3f9a1
        # At least one key is mandatory (`minItems: 1`).
        ssh_keys:
          type: array
          description: |
            Raw SSH public key material (OpenSSH format) to inject into the
            instance for the end user.
          minItems: 1
          items:
            type: string
          example:
            - ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAA... user@host
        operating_system_image:
          description: One of the offer's `os_options`. Provider default if omitted.
          type: string
          example: ubuntu-22.04-cuda-12.4
        root_disk_size_gb:
          description: Override root disk size, where supported.
          type: number
          example: 1000
        cloud_init:
          $ref: '#/components/schemas/CloudInitConfig'
        volume_ids:
          type: array
          description: |
            **Optional tier.** Volumes to attach at create time. Orchestrators
            that support inline attachment should honor this and skip any
            post-create stop → attach → start cycle. Orchestrators that don't
            may ignore it; the marketplace will attach via `/volumes/{id}/attach`
            once the instance is `ONLINE`.
          items:
            type: string

    # Value of `CreateInstanceRequest.cloud_init`; every field is optional.
    CloudInitConfig:
      type: object
      description: |
        Structured cloud-init directives. Only honored when the offer reports
        `supports_cloud_init: true`.
      properties:
        runcmd:
          description: Shell commands to run on first boot.
          type: array
          items:
            type: string
        packages:
          description: Packages to install on first boot.
          type: array
          items:
            type: string
        package_update:
          description: Run package index update on first boot.
          type: boolean
        package_upgrade:
          description: Run package upgrade on first boot.
          type: boolean
        write_files:
          type: array
          items:
            $ref: '#/components/schemas/CloudInitFile'

    # One entry of `CloudInitConfig.write_files`.
    CloudInitFile:
      type: object
      required:
        - path
        - content
      properties:
        path:
          type: string
          example: /etc/myapp/config.yaml
        content:
          description: Plain-text file content.
          type: string
        owner:
          type: string
          example: root:root
        # A string, not a number: the example is quoted so the leading zero
        # is preserved.
        permissions:
          type: string
          example: '0644'

    # One entry of `Instance.port_forwards`.
    PortForward:
      type: object
      description: |
        Port mapping for platforms that NAT instances behind shared IPs.
        Not needed when instances receive directly reachable public IPs.
      required: [internal_port, external_port]
      properties:
        # Both ports are bounded to the valid port range so that 0, negative,
        # and >65535 values fail schema validation instead of reaching clients.
        internal_port:
          type: integer
          minimum: 1
          maximum: 65535
          example: 22
        external_port:
          type: integer
          minimum: 1
          maximum: 65535
          example: 30022

    # Instance record returned by every instance endpoint.
    Instance:
      type: object
      required:
        - id
        - configuration_id
        - region
        - status
        - created_at
        - updated_at
      properties:
        id:
          type: string
          description: Provider-assigned unique instance identifier.
        name:
          type: string
        configuration_id:
          type: string
          description: '`Configuration.id` this instance was deployed from.'
        region:
          type: string
          description: Canonical region token.
          example: "EU-North 1"
        status:
          $ref: "#/components/schemas/InstanceStatus"
        # Connectivity fields. None are schema-required, since they may not
        # exist yet while the instance is still provisioning.
        public_ip:
          type: string
          description: |
            Publicly reachable IP. Required (alongside SSH access) for the
            instance to be considered `ONLINE`.
          example: 203.0.113.10
        private_ip:
          type: string
          example: 10.0.4.21
        ssh_port:
          type: integer
          description: External SSH port (22 unless NAT/port-forwarding is used).
          # Bounded to the valid port range so an out-of-range value fails
          # schema validation rather than producing an unusable SSH endpoint.
          minimum: 1
          maximum: 65535
          default: 22
        ssh_username:
          type: string
          description: Login user for the injected SSH keys.
          example: ubuntu
        port_forwards:
          type: array
          items:
            $ref: "#/components/schemas/PortForward"
        # Hardware fields mirror the same-named `Configuration` properties.
        vcpus:
          type: integer
        memory_gb:
          type: number
        storage_gb:
          type: number
        gpu_count:
          type: integer
        gpu_type:
          type: string
        image:
          type: string
          description: Operating system image deployed.
        price_per_hour:
          type: number
          description: |
            Current effective price in USD per hour. Required for
            dynamically priced (e.g. spot) instances.
        error:
          type: string
          description: Human-readable detail when `status` is `ERROR`.
        created_at:
          type: string
          format: date-time
        updated_at:
          type: string
          format: date-time

    # One entry of the `/volumes/locations` response.
    VolumeLocation:
      type: object
      required:
        - id
        - name
      properties:
        id:
          type: string
          description: |
            Canonical region token — MUST equal the matching entry in
            `Configuration.regions`.
          example: 'EU-North 1'
        name:
          type: string
          example: 'EU North (Region 1)'
        types:
          description: Volume types offered in this region.
          type: array
          items:
            type: string
          example:
            - 'ssd'
            - 'nvme-shared'
        country_code:
          type: string
          example: FI

    # Request body of `POST /volumes`.
    CreateVolumeRequest:
      type: object
      required:
        - name
        - size_gb
        - region
      properties:
        name:
          type: string
          example: spheron-vol-7d2c
        size_gb:
          type: number
          example: 500
        region:
          description: Canonical region token from `/volumes/locations`.
          type: string
          example: 'EU-North 1'
        type:
          description: Volume type from `VolumeLocation.types`.
          type: string
          example: ssd
        instance_ids:
          description: Optionally attach to these instances at create time.
          type: array
          items:
            type: string

    # Volume record returned by the volume endpoints.
    Volume:
      type: object
      required:
        - id
        - name
        - size_gb
        - region
        - status
      properties:
        id:
          type: string
        name:
          type: string
        size_gb:
          type: number
        region:
          description: Canonical region token.
          type: string
        type:
          type: string
        status:
          description: Volume lifecycle status.
          type: string
          enum:
            - CREATING
            - AVAILABLE
            - ATTACHING
            - ATTACHED
            - DETACHING
            - DELETING
            - ERROR
        attached_instance_ids:
          description: Instances this volume is currently attached to.
          type: array
          items:
            type: string
        price_per_hour:
          description: Storage price in USD per hour, if billed hourly.
          type: number
        created_at:
          type: string
          format: date-time

    # Error envelope: the payload is nested under a single `error` key.
    Error:
      type: object
      required:
        - error
      properties:
        error:
          type: object
          required:
            - code
            - message
          properties:
            code:
              description: Stable machine-readable error code.
              type: string
              example: capacity_unavailable
            message:
              description: Human-readable explanation.
              type: string
              example: No capacity for gpu-8x-h100-sxm in EU-North 1.
            # Free-form bag; any nested shape is accepted.
            details:
              type: object
              additionalProperties: true
