# Crawl

Capture screenshots and crawl web pages using a headless browser.

## YAML Parameters

```typescript
/**
 * Keys accepted by the CRAWL action in a pipeline YAML definition.
 *
 * `crawl_suite`, `url`, `type` and `action` are required; every other key is optional.
 */
interface YAMLParameters {
  /** The name of the crawl suite to run. */
  crawl_suite: string;
  /** The URL of the page to crawl. */
  url: string;
  /** The action type discriminator. Must be the literal "CRAWL" for this action. */
  type: "CRAWL";
  /** Unique identifier for the action within the pipeline. */
  action: string;
  /** Crawl all subpages of the page. Default: false */
  follow_links?: boolean;
  /** Respect robots.txt. Default: false */
  respect_robots?: boolean;
  /** The list of screenshot output configurations. */
  outputs?: OutputYaml[];
  /** The list of browsers to use for crawling. */
  browsers?: string[];
  /** The color scheme to use (e.g., 'light', 'dark'). */
  color_scheme?: string;
  /** Cookies to set when visiting the URL. */
  cookies?: string[];
  /** HTTP headers to include in requests. */
  headers?: string[];
  /** Add delay in milliseconds before taking screenshot (format: [scope::]milliseconds, e.g., "1000" or "example.com::2000") */
  delays?: string[];
  /** Wait for elements to appear before taking screenshot. */
  wait_for_elements?: string[];
  /** Local storage key-value pairs to set before crawling. */
  local_storage?: string[];
  /** The list of device configurations to emulate. */
  devices?: DeviceYaml[];
  /** Directory where crawled outputs will be stored. */
  output_directory?: string;
  /** The directory in which the pipeline filesystem will be mounted. */
  working_directory?: string;
  /** The commands that will be executed. */
  execute_commands?: string[];
  /** The commands that will be executed only on the first run. */
  setup_commands?: string[];
  /** The dependencies and directories to be cached and available to every execution in this pipeline. */
  cached_dirs?: string[];
  /** The name of the shell that will be used to execute commands. */
  shell?: "SH" | "BASH" | "POWERSHELL";
  /** If set to true all commands will be executed regardless of the result of the previous command. */
  execute_every_command?: boolean;
  /** If set to true, the filesystem will not be mounted in the container. */
  mount_filesystem_disable?: boolean;
  /** The path preceding the colon is the filesystem path (the folder from the filesystem to be mounted in the container). The path after the colon is the container path (the path in the container, where this filesystem will be located). */
  volume_mappings?: string[];
  /** The hostname of the container in which the action is run. The container will be available under this name in the docker network for services. */
  main_service_name?: string;
  /** All build commands are run as the default user defined in the selected Docker image. Can be set to another username (on the condition that this user exists in the selected image). */
  run_as_user?: string;
  /** The entrypoint to use for the Docker container. */
  entrypoint?: string;
  /** If set to true, resets the default entrypoint set by the image. */
  reset_entrypoint?: boolean;
  /** If set to true, the cached version of the image is used, instead of being pulled each time. */
  cache_base_image?: boolean;
  /** If set to true, use cached image on timeouts (only for official images). */
  ignore_image_pull_failures?: boolean;
  /** The path in the container to export. */
  export_container_path?: string;
  /** The name of the Docker image. */
  docker_image_name?: string;
  /** The tag of the Docker image. */
  docker_image_tag?: string;
  /** The location of the image used by the action. */
  image_location?: "PUBLIC_REGISTRY" | "PRIVATE_REGISTRY" | "ACTION" | "ARTIFACT_REGISTRY";
  /** The type of registry from which the image is retrieved. */
  docker_registry?: "NONE" | "DOCKER_HUB" | "AMAZON_ECR" | "GOOGLE_GCR" | "GOOGLE_ARTIFACT_REGISTRY" | "OTHER" | "GIT_HUB_CONTAINER_REGISTRY" | "ARTIFACT_REGISTRY" | "DIGITAL_OCEAN_CONTAINER_REGISTRY";
  /** The identifier of the artifact from artifact registry. */
  artifact?: string;
  /** The version of the artifact from artifact registry. */
  version?: string;
  /** If set to true, the image from another action will be used. */
  use_image_from_action?: boolean;
  /** The name of the action from which the Docker image will be used. */
  docker_build_action_name?: string;
  /** The URL of the Docker registry. */
  registry?: string;
  /** If set to true, the registry will be accessed over HTTP instead of HTTPS. */
  insecure_registry?: boolean;
  /** The target stage of the Dockerfile. */
  target_stage?: string;
  /** If set to true, images will not be pruned after the build. */
  do_not_prune_images?: boolean;
  /** The username for the Docker registry. */
  login?: string;
  /** The password for the Docker registry. */
  password?: string;
  /** The list of services attached to the build environment. */
  services?: ServiceYaml[];
  /** The application/project ID (e.g., Firebase project ID, Google Cloud project ID) */
  application_id?: string;
  /** The name of the visual test suite to run tests against. */
  vt_suite?: string;
  /** The list of target servers or clusters. */
  targets?: object[];
  /** Specifies when the action should be executed. */
  trigger_time?: "ON_EVERY_EXECUTION" | "ON_SUCCESS" | "ON_FAILURE" | "ON_BACK_TO_SUCCESS" | "ON_WARNING" | "ON_WAIT_FOR_APPROVE" | "ON_TERMINATE";
  /** The list of variables for dynamic action execution. The action runs once for each value. */
  loop?: string[];
  /** Defines whether the action should be executed on each failure. Restricted to and required if the trigger_time is ON_FAILURE. */
  run_only_on_first_failure?: boolean;
  /** When set to true the action is disabled. By default it is set to false. */
  disabled?: boolean;
  /** The timeout in seconds. */
  timeout?: number;
  /** If set to true the execution will proceed, mark action as a warning and jump to the next action. Doesn't apply to deployment actions. */
  ignore_errors?: boolean;
  /** Delay time between auto retries in seconds. */
  retry_interval?: number;
  /** Number of retries if the action fails. */
  retry_count?: number;
  /** Defines whether the action should run in parallel with the next one. */
  run_next?: "WAIT_ON_SUCCESS" | "IN_SOFT_PARALLEL" | "IN_HARD_PARALLEL";
  /** The list of trigger conditions to meet so that the action can be triggered. */
  trigger_conditions?: TriggerConditionYaml[];
  /** The list of variables you can use in the action. */
  variables?: VariableYaml[];
}
```

## Type Definitions

```typescript
/** One screenshot output configuration (an entry of the `outputs` list). All keys are optional. */
interface OutputYaml {
  /** The output type; the example in this document uses "PNG". Other accepted values are not listed here — TODO confirm. */
  type?: string;
  /** NOTE(review): presumably a selector limiting the capture to a matching element — confirm. */
  selector?: string;
  /** Numeric quality setting; the example in this document uses 90. Valid range is not stated here — TODO confirm. */
  quality?: number;
  /** NOTE(review): presumably captures the whole page rather than only the visible viewport when true — confirm. */
  full_page?: boolean;
}

/** One device configuration to emulate (an entry of the `devices` list). All keys are optional. */
interface DeviceYaml {
  /** NOTE(review): presumably the name of the emulated device — confirm accepted values. */
  name?: string;
  /** Viewport setting, given as a string. The expected format is not shown in this document — TODO confirm. */
  viewport?: string;
  /** Screen setting, given as a string. The expected format is not shown in this document — TODO confirm. */
  screen?: string;
  /** The device pixel ratio to emulate, as a number. */
  device_pixel_ratio?: number;
  /** NOTE(review): presumably toggles mobile emulation when true — confirm. */
  is_mobile?: boolean;
}

/**
 * One service attached to the build environment (an entry of the `services` list).
 *
 * All keys are optional. `ServiceConnectionYaml`, referenced by `connection`, is not
 * defined in this document.
 */
interface ServiceYaml {
  /** Service type (e.g., MYSQL, POSTGRES, REDIS, MONGO_DB) */
  type?: string;
  /** Service version */
  version?: string;
  /** Path to database dump file */
  dump_path?: string;
  /** When true, data persists between executions */
  persistent?: boolean;
  /** Connection configuration for the service */
  connection?: ServiceConnectionYaml;
  /** Gas price for blockchain services */
  gas_price?: number;
  /** Gas limit for blockchain services */
  gas_limit?: number;
  /** List of accounts for blockchain services */
  accounts?: string[];
  /** Custom Docker image name */
  docker_image_name?: string;
  /** Docker image tag */
  docker_image_tag?: string;
  /** Registry URL */
  registry?: string;
  /** Registry login username */
  login?: string;
  /** Registry password */
  password?: string;
  /** Location of the Docker image */
  image_location?: "PUBLIC_REGISTRY" | "PRIVATE_REGISTRY" | "ACTION" | "ARTIFACT_REGISTRY";
  /** Docker registry type */
  docker_registry?: "NONE" | "DOCKER_HUB" | "AMAZON_ECR" | "GOOGLE_GCR" | "GOOGLE_ARTIFACT_REGISTRY" | "OTHER" | "GIT_HUB_CONTAINER_REGISTRY" | "ARTIFACT_REGISTRY" | "DIGITAL_OCEAN_CONTAINER_REGISTRY";
  /** When true, use image from another action */
  use_image_from_action?: boolean;
  /** ID of Docker build action to use image from */
  docker_build_action_id?: number;
  /** Name of Docker build action to use image from */
  docker_build_action_name?: string;
  /** Working directory in the container */
  working_directory?: string;
  /** Volume mappings for the container */
  volume_mappings?: string[];
  /** Container entrypoint command */
  entrypoint?: string;
  /** Commands to run in the container */
  inline_commands?: string;
  /** Registry region */
  region?: string;
  /** Port to wait for before proceeding */
  wait_for_port?: number;
  /** Integration for registry authentication */
  integration?: string;
  /** Directories to cache between executions */
  cached_dirs?: string[];
  /** When true, pass environment variables to the container */
  pass_variables?: boolean;
  /** User to run the container as */
  run_as_user?: string;
  /** When true, ignore image pull failures */
  ignore_image_pull_failures?: boolean;
  /** When true, cache the base Docker image */
  cache_base_image?: boolean;
}

/**
 * One trigger condition (an entry of the `trigger_conditions` list).
 *
 * Only `trigger_condition` is required. The type is recursive: `trigger_operands`
 * holds nested conditions of this same type.
 */
interface TriggerConditionYaml {
  /** The type of trigger condition */
  trigger_condition: "ALWAYS" | "ON_CHANGE" | "ON_CHANGE_AT_PATH" | "VAR_IS" | "VAR_IS_NOT" | "VAR_CONTAINS" | "VAR_NOT_CONTAINS" | "DATETIME" | "SUCCESS_PIPELINE" | "DAY" | "HOUR" | "OR" | "VAR_LESS_THAN" | "VAR_LESS_THAN_OR_EQUAL" | "VAR_GREATER_THAN" | "VAR_GREATER_THAN_OR_EQUAL" | "ACTION_STATUS_IS" | "ACTION_STATUS_IS_NOT" | "TRIGGERING_USER_IS" | "TRIGGERING_USER_IS_NOT" | "TRIGGERING_USER_IS_IN_GROUP" | "TRIGGERING_USER_IS_NOT_IN_GROUP";
  /** The value to compare the trigger variable against */
  trigger_variable_value?: string;
  /** The name of the variable to check in the trigger condition */
  trigger_variable_key?: string;
  /** The timezone for datetime trigger conditions (e.g., 'UTC', 'Europe/Warsaw') */
  timezone?: string;
  /** The hours when the datetime trigger should activate (0-23) */
  trigger_hours?: number[];
  /** The days when the datetime trigger should activate (1-7, where 1 is Monday) */
  trigger_days?: number[];
  /** The project name for cross-project pipeline triggers */
  project?: string;
  /** The pipeline name for cross-pipeline triggers */
  pipeline?: string;
  /** The email of the user who can trigger the pipeline */
  trigger_user?: string;
  /** The name of the group that can trigger the pipeline */
  trigger_group?: string;
  /** The file paths that must change to trigger the pipeline */
  trigger_condition_paths?: string[];
  /** The action status to check for action status triggers */
  trigger_status?: "SUCCESSFUL" | "FAILED" | "SKIPPED" | "SUPPRESSED";
  /** The name of the action to check status for */
  trigger_action_name?: string;
  /** The list of nested trigger conditions for OR/AND operators. NOTE(review): "AND" is not among the `trigger_condition` values listed in this interface (only "OR" is) — confirm. */
  trigger_operands?: TriggerConditionYaml[];
}

/**
 * One variable available to the action (an entry of the `variables` list).
 *
 * Only `key` is required; every other field is optional.
 */
interface VariableYaml {
  /** The name of the variable */
  key: string;
  /** The value of the variable */
  value?: string;
  /** The type of the added variable */
  type?: "VAR" | "FILE" | "SSH_KEY" | "IOS_KEYCHAIN" | "IOS_PROVISION_PROFILES" | "SSH_PUBLIC_KEY" | "GPG_KEY";
  /** If set to true the variable value will be encrypted and hidden */
  encrypted?: boolean;
  /** The optional description of the variable */
  description?: string;
  /** Initial path for the variable */
  init_path?: string;
  /** Default value for the variable */
  defaults?: string;
  /** Set if type is FILE, SSH_KEY, IOS_KEYCHAIN, or IOS_PROVISION_PROFILES. If it's NONE, the variable can be used as a parameter in an action. For CONTAINER, the given key is additionally copied to an action container on each run */
  file_place?: "NONE" | "CONTAINER";
  /** Whether the file is binary */
  binary?: boolean;
  /** Public value for SSH key type variables */
  public_value?: string;
  /** Fingerprint of SSH key */
  key_fingerprint?: string;
  /** Checksum of the variable value */
  checksum?: string;
  /** Password for certificates */
  password?: string;
  /** Passphrase for encrypted SSH keys */
  passphrase?: string;
  /** Key identifier for iOS certificates, provisioning profiles, or GPG keys */
  key_identifier?: string;
  /** If set to true the variable value can be set by Buddy actions */
  // A true/false flag like `encrypted` and `binary`, so it is typed as boolean rather than string.
  settable?: boolean;
  /** Encoding of the variable value. Use `b64` for binary files (certificates, images, compiled blobs) where the value is already base64-encoded. Omit or set to `text` for plain text files (JSON, XML, config) — the system will handle encoding automatically. Only applies to non-encrypted asset variables (FILE, SSH_KEY, SSH_PUBLIC_KEY, IOS_KEYCHAIN, IOS_PROVISION_PROFILES). */
  encoding?: "text" | "b64";
  /** Specifies where to copy the file on each run. Set if type is FILE, SSH_KEY, IOS_KEYCHAIN, or IOS_PROVISION_PROFILES. */
  path?: string;
  /** File permission set on copy to a container on each run. Set if type is FILE, SSH_KEY, IOS_KEYCHAIN, or IOS_PROVISION_PROFILES. */
  chmod?: string;
}

```

## YAML Examples

### Crawl

```yaml
  # Required keys: action, type, crawl_suite and url. Every other key below is optional.
  - action: "Crawl"
    type: "CRAWL"
    trigger_time: "ON_EVERY_EXECUTION"
    crawl_suite: "my-crawl-suite"
    url: "https://example.com"
    # Both follow_links and respect_robots default to false when omitted.
    follow_links: true
    respect_robots: true
    # Screenshot output configurations.
    outputs:
      - type: "PNG"
        quality: 90
        full_page: true
    browsers:
      - "chromium"
    color_scheme: "dark"
    cookies:
      - "session=abc123"
    headers:
      - "Authorization: Bearer token"
    # Delay before taking the screenshot; format: [scope::]milliseconds.
    delays:
      - "1000"
    # Elements to wait for before taking the screenshot.
    wait_for_elements:
      - "#main-content"
    local_storage:
      - "key=value"
    # Directory where crawled outputs will be stored.
    output_directory: "/crawl-output"

```


---
Original source: https://buddy.works/docs/yaml/yaml-actions/crawl