> ## Documentation Index
> Fetch the complete documentation index at: https://docs.octen.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Extract

> Extracts clean markdown content from URLs. Supports batch processing, query-focused highlights, page classification, and multimedia resources.



## OpenAPI

````yaml /api-reference/openapi.json post /extract
# OpenAPI document header: spec version, API metadata, base URL, and the
# default security requirement applied to every operation.
openapi: '3.1.0'
info:
  title: 'Octen API'
  version: '1.0.0'
  description: >-
    Octen API provides Search, Extract, Embeddings, VL Embeddings, Web Chat,
    Broad Search, and Deep Research services.
    The Search API searches ranked web results with optional filters,
    highlights, and full content.
    The Extract API extracts clean markdown content from URLs, with optional
    query-focused highlights, page classification, and multimedia resources.
    The Embeddings API converts text into vector representations.
    The VL Embeddings API converts multimodal inputs (text, images, videos)
    into vector representations.
    The Web Chat API provides LLM chat completions with search augmentation.
    The Broad Search API automatically decomposes queries into multiple
    sub-queries for comprehensive search and synthesis.
    The Deep Research API runs a multi-round adaptive research pipeline that
    produces a structured research plan, executes iterative web searches,
    builds a report brief with evidence, and streams a final long-form report.
servers:
  - url: 'https://api.octen.ai'
# `apiKeyAuth` is declared under components.securitySchemes.
security:
  - apiKeyAuth: []
# Single operation: POST /extract. The request and response bodies are
# described by schemas under components.schemas; the shared error responses
# are described under components.responses.
paths:
  /extract:
    post:
      summary: Extract
      description: >-
        Extracts clean markdown content from URLs. Supports batch processing,
        query-focused highlights, page classification, and multimedia resources.
      operationId: extract
      # A JSON body is mandatory; only `urls` is required by ExtractRequest.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ExtractRequest'
            # Named request examples; each `value` is a complete request body.
            examples:
              basic:
                summary: Basic Extract
                value:
                  urls:
                    - https://docs.octen.ai/api-reference/search
                    - https://docs.octen.ai/api-reference/extract
              # Supplying `query` switches the result to query-relevant
              # highlights (see the `query` property of ExtractRequest).
              intentQuery:
                summary: Intent-focused Highlights with Query
                value:
                  urls:
                    - >-
                      https://www.who.int/news-room/fact-sheets/detail/influenza-(seasonal)
                  query: vaccination guidelines
              # Opt-in flags for media URLs and the favicon.
              withMedia:
                summary: With Multimedia Resources
                value:
                  urls:
                    - https://example.com/article
                  include_images: true
                  include_videos: true
                  include_favicon: true
      # Per ExtractResult, a 200 may still contain individually failed items.
      responses:
        '200':
          description: Successful extraction response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ExtractResponse'
              # Illustrative payload: two successful results without `query`,
              # so `full_content` is populated and `highlights` is null.
              # NOTE(review): the second result's URL comes from the
              # intentQuery request example rather than the basic one —
              # confirm this example is not meant to mirror a single request.
              example:
                code: 0
                msg: success
                request_id: req_abc123def456
                data:
                  results:
                    - url: https://docs.octen.ai/api-reference/search
                      status: success
                      title: Search - Octen
                      full_content: Octen Search API enables ranked web results...
                      highlights: null
                      time_published: '2026-01-15T00:00:00Z'
                      time_last_crawled: '2026-04-21T08:30:05Z'
                      page_structure:
                        primary: Content Page
                        secondary: Article
                      category:
                        primary: Computers, Electronics & Technology
                        secondary: Artificial Intelligence
                    - url: >-
                        https://www.who.int/news-room/fact-sheets/detail/influenza-(seasonal)
                      status: success
                      title: Influenza (Seasonal) - World Health Organization (WHO)
                      full_content: >-
                        Seasonal influenza is an acute respiratory infection
                        caused by influenza viruses...
                      highlights: null
                      time_published: '2024-10-15T00:00:00Z'
                      time_last_crawled: '2026-04-21T08:30:05Z'
                      page_structure:
                        primary: Content Page
                        secondary: Article
                      category:
                        primary: Health
                        secondary: Infectious Disease
                # `latency` is expressed in milliseconds (see ExtractMeta).
                meta:
                  usage:
                    total_urls: 2
                    successful_urls: 2
                  latency: 1832
                  warning: null
        # Request-level validation failure, defined inline.
        '400':
          description: >-
            Invalid params — Returned when a required parameter is missing or
            invalid.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
              example:
                code: 400
                msg: Invalid params. urls is required
        # The remaining errors reuse the definitions in components.responses.
        '401':
          $ref: '#/components/responses/Unauthorized'
        '403':
          $ref: '#/components/responses/InsufficientBalance'
        '429':
          $ref: '#/components/responses/RateLimited'
        '500':
          $ref: '#/components/responses/InternalError'
components:
  schemas:
    # Request body for POST /extract. Only `urls` is mandatory; every other
    # property is an optional tuning knob with a documented default.
    ExtractRequest:
      type: object
      required:
        - urls
      description: Request body for the Extract API.
      properties:
        urls:
          type: array
          # Machine-readable form of the limits stated in the description, so
          # validators and generated clients enforce them as well.
          maxItems: 20
          items:
            type: string
            maxLength: 2048
          description: >-
            List of URLs to extract content from. Maximum URLs per request: 20.
            Maximum length per URL: 2048. Failed URLs are not billed.
          example:
            - https://example.com/article-1
            - https://example.com/article-2
        query:
          type: string
          maxLength: 500
          description: >-
            Intent-focused keywords. When provided, returns query-relevant
            highlights per URL; otherwise returns the complete page content.
        max_age_seconds:
          type: integer
          default: 86400
          minimum: 300
          # NOTE(review): the description mentions adjusting to "the nearest
          # bound", but only a lower bound is declared here — confirm whether
          # an upper bound exists and should be added as `maximum`.
          description: >-
            Maximum age (in seconds) of cached content. URLs whose cached
            version exceeds this threshold will be re-fetched. Values outside
            the allowed range are adjusted to the nearest bound.
        format:
          type: string
          enum:
            - markdown
            - text
          default: markdown
          description: Format of the returned content.
        timeout:
          type: integer
          default: 30
          minimum: 1
          maximum: 60
          description: >-
            Per-URL extraction timeout in seconds. Values outside the allowed
            range are adjusted to the nearest bound.
        # Opt-in flags; each one defaults to false.
        include_images:
          type: boolean
          default: false
          description: Whether to return image URLs detected on the page.
        include_videos:
          type: boolean
          default: false
          description: Whether to return video URLs detected on the page.
        include_audio:
          type: boolean
          default: false
          description: Whether to return audio URLs detected on the page.
        include_favicon:
          type: boolean
          default: false
          description: Whether to return the page's favicon URL.
    # Envelope returned by the 200 response of POST /extract: status fields
    # plus the `data` payload and request-level `meta`.
    ExtractResponse:
      type: object
      properties:
        code:
          description: 'Business status code. 0 indicates success.'
          type: integer
        msg:
          description: 'A human-readable message describing the result.'
          type: string
        request_id:
          description: 'The unique identifier for this request.'
          type: string
        # Payload and metadata are described by their own schemas.
        data:
          $ref: '#/components/schemas/ExtractData'
        meta:
          $ref: '#/components/schemas/ExtractMeta'
    # Error envelope referenced by the 400 response and by every entry in
    # components.responses; both fields are mandatory.
    ErrorResponse:
      type: object
      required: [code, msg]
      properties:
        code:
          description: 'Business status code. Non-zero values indicate an error.'
          type: integer
        msg:
          description: 'A human-readable message describing the error.'
          type: string
    # Container for the per-URL results list.
    ExtractData:
      description: 'The main extract response payload.'
      type: object
      properties:
        results:
          description: >-
            Extraction result for each requested URL.
            Order matches the input urls array.
          type: array
          items:
            $ref: '#/components/schemas/ExtractResult'
    # Request-level metadata returned next to `data`: usage counters, total
    # latency, and an optional warning.
    ExtractMeta:
      type: object
      description: Additional metadata for the extract request.
      properties:
        usage:
          $ref: '#/components/schemas/ExtractUsage'
        latency:
          type: integer
          description: Total request latency in milliseconds.
        warning:
          # OpenAPI 3.1 expresses nullability with a type array; the 3.0-era
          # `nullable` keyword is not part of the 3.1 schema vocabulary.
          type: [string, 'null']
          description: Optional warning message (e.g. when some URLs failed).
    # One entry of `data.results`: the outcome of extracting a single URL.
    # Fields that may be null are declared with OpenAPI 3.1 type arrays
    # (e.g. [string, 'null']) because the 3.0-era `nullable` keyword is not
    # part of the 3.1 schema vocabulary.
    ExtractResult:
      type: object
      description: >-
        A single extraction result. Batch requests may return 200 OK overall
        while individual items fail; failed items are marked with `status:
        "failed"` and an `error_message`, and are not billed.
      properties:
        url:
          type: string
          description: The requested URL.
        status:
          type: string
          enum:
            - success
            - failed
          description: Extraction status for this URL.
        title:
          type: [string, 'null']
          description: Page title, extracted from HTML `<title>` or `<meta>` tags.
        # `full_content` and `highlights` are alternatives selected by whether
        # the request carried a `query`.
        full_content:
          type: [string, 'null']
          description: >-
            Complete page content in the requested format. Returned when `query`
            is not provided.
        highlights:
          type: [array, 'null']
          items:
            type: string
          description: >-
            Query-relevant snippets, sorted by relevance. Returned when `query`
            is provided.
        time_published:
          type: [string, 'null']
          format: date-time
          description: Content publication time, ISO 8601 format.
        time_last_crawled:
          type: string
          format: date-time
          description: Most recent time Octen crawled this URL, ISO 8601 format.
        page_structure:
          # Either the structure object or an explicit null, so the schema
          # agrees with the documented "Returns `null`" behavior.
          oneOf:
            - $ref: '#/components/schemas/ExtractPageStructure'
            - type: 'null'
          description: >-
            Detected page structure. Returns `null` when the structure cannot be
            determined.
        category:
          # Either the category object or an explicit null, so the schema
          # agrees with the documented "Returns `null`" behavior.
          oneOf:
            - $ref: '#/components/schemas/ExtractCategory'
            - type: 'null'
          description: >-
            Detected content category. Returns `null` when the category cannot
            be determined.
        # Media lists are opt-in through the matching `include_*` request flag.
        images:
          type: array
          items:
            $ref: '#/components/schemas/ExtractMediaResource'
          description: >-
            Image resources detected on the page. Returned when `include_images`
            is `true` and the page contains images.
        videos:
          type: array
          items:
            $ref: '#/components/schemas/ExtractMediaResource'
          description: >-
            Video resources detected on the page. Returned when `include_videos`
            is `true` and the page contains videos.
        audio:
          type: array
          items:
            $ref: '#/components/schemas/ExtractMediaResource'
          description: >-
            Audio resources detected on the page. Returned when `include_audio`
            is `true` and the page contains audio.
        favicon:
          type: [string, 'null']
          description: The page's favicon URL. Returned when `include_favicon` is `true`.
        error_message:
          type: string
          description: >-
            Failure reason. Only present when `status` is `failed`. See the
            Error Codes reference for the complete list of result-level error
            messages.
    # URL counters for the request; only successful URLs count as billed.
    ExtractUsage:
      description: 'Usage and billing breakdown for the extract request.'
      type: object
      properties:
        total_urls:
          description: 'Total number of URLs in the request.'
          type: integer
        successful_urls:
          description: >-
            Number of successfully extracted URLs (billed).
            Failed URLs are not counted.
          type: integer
    # Two-level page-type classification attached to an extraction result.
    ExtractPageStructure:
      type: object
      description: Detected page structure for an extraction result.
      properties:
        primary:
          type: string
          description: Top-level page type.
        secondary:
          # OpenAPI 3.1 expresses nullability with a type array; the 3.0-era
          # `nullable` keyword is not part of the 3.1 schema vocabulary.
          type: [string, 'null']
          description: Sub-type within the primary structure.
    # Two-level content-category classification attached to an extraction
    # result.
    ExtractCategory:
      type: object
      description: Detected content category for an extraction result.
      properties:
        primary:
          type: string
          description: Top-level content category.
        secondary:
          # OpenAPI 3.1 expresses nullability with a type array; the 3.0-era
          # `nullable` keyword is not part of the 3.1 schema vocabulary.
          type: [string, 'null']
          description: Sub-category within the primary category.
    # Item type shared by the `images`, `videos`, and `audio` result lists.
    ExtractMediaResource:
      description: 'A single multimedia resource detected on the page.'
      type: object
      properties:
        url:
          description: 'Resource URL.'
          type: string
  # Reusable error responses referenced from the /extract operation. Each one
  # carries the ErrorResponse envelope plus a representative example.
  responses:
    Unauthorized:
      description: 'Invalid API Key — Returned when the API key is missing or invalid.'
      content:
        application/json:
          example:
            code: 401
            msg: 'Invalid API Key'
          schema:
            $ref: '#/components/schemas/ErrorResponse'
    InsufficientBalance:
      description: >-
        Insufficient balance in account — Returned when the account
        balance is insufficient to complete the request.
      content:
        application/json:
          example:
            code: 403
            msg: 'Insufficient balance in account'
          schema:
            $ref: '#/components/schemas/ErrorResponse'
    RateLimited:
      description: >-
        Exceeding the rate limit — Returned when the request
        exceeds the configured rate limit.
      content:
        application/json:
          example:
            code: 429
            msg: 'Exceeding the rate limit'
          schema:
            $ref: '#/components/schemas/ErrorResponse'
    InternalError:
      description: 'Internal error — Returned when an unexpected server-side error occurs.'
      content:
        application/json:
          example:
            code: 500
            msg: 'Internal error'
          schema:
            $ref: '#/components/schemas/ErrorResponse'
  # API-key authentication sent in the `x-api-key` request header; this is the
  # scheme named by the document-level `security` requirement.
  securitySchemes:
    apiKeyAuth:
      description: >-
        API key used for request authentication.
        Obtain an API key before using the API.
        Note: A payment method is required to use the API.
      type: apiKey
      name: x-api-key
      in: header

````