# OpenAPI document header for the Scrapii API: spec version, API metadata,
# server list, tag catalogue, and the security scheme applied to every
# operation.
openapi: '3.0.3'
info:
  title: Scrapii API Documentation
  # Double-quoted on purpose: the value relies on \n escapes, and the
  # continuation lines are folded by the YAML parser.
  description: "Scrapii is a secure PII redaction and tokenization API built for privacy-first AI workflows. It detects and replaces sensitive data with non-sensitive tokens, allowing documents to be safely processed by public AI services and frontier LLMs without exposing regulated or personal information.\n
    \nScrapii supports authorized detokenization, enabling clients to securely restore original values when required for downstream workflows, audits, or human review. For maximum trust and data sovereignty, Scrapii also supports client-controlled token encryption, ensuring sensitive values can only be decrypted by the client—never the platform.\n
    \nDesigned for developers, Scrapii integrates seamlessly into automated pipelines via REST APIs, n8n workflows, or any existing processing pipeline, enabling low-friction redaction and de-identification across text, PDFs, and images. Deterministic token mapping preserves referential integrity while maintaining strict data boundaries.\n
    \nScrapii is ideal for compliance-driven industries such as healthcare, legal, and finance, helping teams meet GDPR, HIPAA, SOC 2, and PIPEDA requirements while still leveraging modern AI, document processing, and automation tools."
  version: '1.0.0'

servers:
  - url: https://scrapii.net

tags:
  - name: Endpoints
    description: ''

components:
  securitySchemes:
    # HTTP bearer authentication; the scheme is named "default" and is
    # referenced by the top-level security requirement below.
    default:
      type: http
      scheme: bearer
      description: >-
        You can create API keys by visiting your dashboard and clicking
        Generate API Key. Include the API key in the Authorization header as:
        Bearer {apikey}

# Applies the "default" scheme to all operations.
security:
  - default: []
paths:
# GET /api/v1/test. Only the 401 response is documented for this operation.
/api/v1/test:
  get:
    summary: ''
    operationId: getApiV1Test
    description: ''
    parameters: []
    responses:
      # Status codes are quoted: OpenAPI requires string keys here so the
      # document converts cleanly between YAML and JSON.
      '401':
        description: Unauthenticated.
        content:
          application/json:
            schema:
              type: object
              example:
                message: Unauthenticated.
              properties:
                message:
                  type: string
                  example: Unauthenticated.
    tags:
      - Endpoints
# POST /api/v1/document. Multipart upload of a document for PII detection and
# tokenization; the 201 body carries the created document record.
/api/v1/document:
  post:
    summary: 'Upload and tokenize a document.'
    operationId: uploadAndTokenizeADocument
    description: 'Upload a document for PII detection and tokenization.'
    parameters: []
    responses:
      # Status codes are quoted: OpenAPI requires string keys here so the
      # document converts cleanly between YAML and JSON.
      '201':
        description: 'Document uploaded successfully.'
        content:
          application/json:
            schema:
              type: object
              example:
                message: 'Document uploaded successfully.'
                data:
                  id: 9d3a5c8e-4b2f-4a1e-8c3d-5e6f7a8b9c0d
                  user_id: 1
                  mode: token
                  status: queued
                  original_filename: example.pdf
                  confidence_threshold: 0.85
                  entity_types:
                    - EMAIL_ADDRESS
                    - PHONE_NUMBER
                  progress_percentage: 0
                  current_step: 'Queued for processing'
                  created_at: '2024-01-01T00:00:00.000000Z'
                  updated_at: '2024-01-01T00:00:00.000000Z'
              properties:
                message:
                  type: string
                  example: 'Document uploaded successfully.'
                data:
                  type: object
                  properties:
                    id:
                      type: string
                      example: 9d3a5c8e-4b2f-4a1e-8c3d-5e6f7a8b9c0d
                    user_id:
                      type: integer
                      example: 1
                    mode:
                      type: string
                      example: token
                    status:
                      type: string
                      example: queued
                    original_filename:
                      type: string
                      example: example.pdf
                    confidence_threshold:
                      type: number
                      example: 0.85
                    entity_types:
                      type: array
                      example:
                        - EMAIL_ADDRESS
                        - PHONE_NUMBER
                      items:
                        type: string
                    progress_percentage:
                      type: integer
                      example: 0
                    current_step:
                      type: string
                      example: 'Queued for processing'
                    created_at:
                      type: string
                      example: '2024-01-01T00:00:00.000000Z'
                    updated_at:
                      type: string
                      example: '2024-01-01T00:00:00.000000Z'
    tags:
      - Endpoints
    requestBody:
      required: true
      content:
        multipart/form-data:
          schema:
            type: object
            properties:
              file:
                type: string
                format: binary
                description: 'The document file to process. Maximum size: 10 MB.'
              confidence_threshold:
                type: number
                description: 'Minimum confidence score (0.0-1.0) for PII detection.'
                # Bounds mirror the 0.0-1.0 range stated in the description.
                minimum: 0
                maximum: 1
                example: 0.85
                nullable: true
              entity_types:
                type: array
                description: 'Optional array of specific PII entity types to detect.'
                example:
                  - EMAIL_ADDRESS
                  - PHONE_NUMBER
                items:
                  type: string
            required:
              - file
# GET /api/v1/document/{document_id}. Returns the tokenized content of one
# document; 403/404/409 cover permission, missing-file, and not-ready/failed
# cases respectively.
'/api/v1/document/{document_id}':
  get:
    summary: 'Get the tokenized document content.'
    operationId: getTheTokenizedDocumentContent
    description: ''
    parameters: []
    responses:
      # Status codes are quoted: OpenAPI requires string keys here so the
      # document converts cleanly between YAML and JSON.
      '200':
        description: 'Document content retrieved successfully.'
        content:
          application/json:
            schema:
              type: object
              example:
                message: 'Document content retrieved successfully.'
                data:
                  content: 'Contact [TOKEN:EMAIL_ADDRESS:f26ab8cf5a26d76fd04f4f7a740b0c0e596975c528cb0028a365898fae44915e] for more information.'
                  document_id: 9d3a5c8e-4b2f-4a1e-8c3d-5e6f7a8b9c0d
                  original_filename: example.pdf
              properties:
                message:
                  type: string
                  example: 'Document content retrieved successfully.'
                data:
                  type: object
                  properties:
                    content:
                      type: string
                      example: 'Contact [TOKEN:EMAIL_ADDRESS:f26ab8cf5a26d76fd04f4f7a740b0c0e596975c528cb0028a365898fae44915e] for more information.'
                    document_id:
                      type: string
                      example: 9d3a5c8e-4b2f-4a1e-8c3d-5e6f7a8b9c0d
                    original_filename:
                      type: string
                      example: example.pdf
      '403':
        description: 'You do not have permission to access this document.'
        content:
          application/json:
            schema:
              type: object
              example:
                message: 'You do not have permission to access this document.'
              properties:
                message:
                  type: string
                  example: 'You do not have permission to access this document.'
      '404':
        description: 'Processed document file not found.'
        content:
          application/json:
            schema:
              type: object
              example:
                message: 'Processed document file not found.'
              properties:
                message:
                  type: string
                  example: 'Processed document file not found.'
      '409':
        description: 'Document is not ready for download, or document processing failed.'
        content:
          application/json:
            schema:
              # Two alternative bodies share this status code.
              oneOf:
                - description: 'Not Ready'
                  type: object
                  example:
                    message: 'Document is not ready for download.'
                  properties:
                    message:
                      type: string
                      example: 'Document is not ready for download.'
                - description: Failed
                  type: object
                  example:
                    message: 'Document processing failed and cannot be downloaded.'
                  properties:
                    message:
                      type: string
                      example: 'Document processing failed and cannot be downloaded.'
    tags:
      - Endpoints
  # Path-level parameter shared by every operation on this path.
  parameters:
    - in: path
      name: document_id
      description: 'The ID of the document.'
      example: '019c4ece-4c65-70c9-9b0b-09c861785a85'
      required: true
      schema:
        type: string
# GET /api/v1/document/{document_id}/status. Returns the document record,
# including status, progress percentage, and current processing step.
'/api/v1/document/{document_id}/status':
  get:
    summary: 'Get document status and details.'
    operationId: getDocumentStatusAndDetails
    description: ''
    parameters: []
    responses:
      # Status codes are quoted: OpenAPI requires string keys here so the
      # document converts cleanly between YAML and JSON.
      '200':
        description: 'Document status and details.'
        content:
          application/json:
            schema:
              type: object
              example:
                data:
                  id: 9d3a5c8e-4b2f-4a1e-8c3d-5e6f7a8b9c0d
                  user_id: 1
                  mode: token
                  status: completed
                  original_filename: example.pdf
                  confidence_threshold: 0.85
                  entity_types:
                    - EMAIL_ADDRESS
                    - PHONE_NUMBER
                  progress_percentage: 100
                  current_step: 'Processing complete'
                  created_at: '2024-01-01T00:00:00.000000Z'
                  updated_at: '2024-01-01T00:00:01.000000Z'
              properties:
                data:
                  type: object
                  properties:
                    id:
                      type: string
                      example: 9d3a5c8e-4b2f-4a1e-8c3d-5e6f7a8b9c0d
                    user_id:
                      type: integer
                      example: 1
                    mode:
                      type: string
                      example: token
                    status:
                      type: string
                      example: completed
                    original_filename:
                      type: string
                      example: example.pdf
                    confidence_threshold:
                      type: number
                      example: 0.85
                    entity_types:
                      type: array
                      example:
                        - EMAIL_ADDRESS
                        - PHONE_NUMBER
                      items:
                        type: string
                    progress_percentage:
                      type: integer
                      example: 100
                    current_step:
                      type: string
                      example: 'Processing complete'
                    created_at:
                      type: string
                      example: '2024-01-01T00:00:00.000000Z'
                    updated_at:
                      type: string
                      example: '2024-01-01T00:00:01.000000Z'
      '403':
        description: 'You do not have permission to access this document.'
        content:
          application/json:
            schema:
              type: object
              example:
                message: 'You do not have permission to access this document.'
              properties:
                message:
                  type: string
                  example: 'You do not have permission to access this document.'
    tags:
      - Endpoints
  # Path-level parameter shared by every operation on this path.
  parameters:
    - in: path
      name: document_id
      description: 'The ID of the document.'
      example: '019c4ece-4c65-70c9-9b0b-09c861785a85'
      required: true
      schema:
        type: string
# GET /api/v1/document/{document_id}/detokenize. Returns the original content
# of a processed document together with per-token replacement bookkeeping.
'/api/v1/document/{document_id}/detokenize':
  get:
    summary: 'Detokenize a processed document and return the original content.'
    operationId: detokenizeAProcessedDocumentAndReturnTheOriginalContent
    description: ''
    parameters: []
    responses:
      # Status codes are quoted: OpenAPI requires string keys here so the
      # document converts cleanly between YAML and JSON.
      '200':
        description: 'Document detokenized successfully.'
        content:
          application/json:
            schema:
              type: object
              example:
                message: 'Document detokenized successfully.'
                data:
                  content: 'Contact john.doe@example.com for more information.'
                  document_id: 9d3a5c8e-4b2f-4a1e-8c3d-5e6f7a8b9c0d
                  original_filename: example.pdf
                  tokens_found:
                    - 'EMAIL_ADDRESS:f26ab8cf5a26d76fd04f4f7a740b0c0e596975c528cb0028a365898fae44915e'
                  tokens_replaced:
                    - 'EMAIL_ADDRESS:f26ab8cf5a26d76fd04f4f7a740b0c0e596975c528cb0028a365898fae44915e'
                  tokens_failed: []
                  total_detokenizations: 1
              properties:
                message:
                  type: string
                  example: 'Document detokenized successfully.'
                data:
                  type: object
                  properties:
                    content:
                      type: string
                      example: 'Contact john.doe@example.com for more information.'
                    document_id:
                      type: string
                      example: 9d3a5c8e-4b2f-4a1e-8c3d-5e6f7a8b9c0d
                    original_filename:
                      type: string
                      example: example.pdf
                    tokens_found:
                      type: array
                      example:
                        - 'EMAIL_ADDRESS:f26ab8cf5a26d76fd04f4f7a740b0c0e596975c528cb0028a365898fae44915e'
                      items:
                        type: string
                    tokens_replaced:
                      type: array
                      example:
                        - 'EMAIL_ADDRESS:f26ab8cf5a26d76fd04f4f7a740b0c0e596975c528cb0028a365898fae44915e'
                      items:
                        type: string
                    tokens_failed:
                      type: array
                      example: []
                      # OpenAPI 3.0 requires `items` on array schemas. Left
                      # untyped on purpose.
                      # NOTE(review): presumably token ID strings like
                      # tokens_found; confirm and tighten.
                      items: {}
                    total_detokenizations:
                      type: integer
                      example: 1
      '400':
        description: 'Document is not tokenized'
        content:
          application/json:
            schema:
              type: object
              example:
                message: 'Document is not tokenized'
              properties:
                message:
                  type: string
                  example: 'Document is not tokenized'
      '403':
        description: Unauthorized
        content:
          application/json:
            schema:
              type: object
              example:
                message: Unauthorized
              properties:
                message:
                  type: string
                  example: Unauthorized
    tags:
      - Endpoints
  # Path-level parameter shared by every operation on this path.
  parameters:
    - in: path
      name: document_id
      description: 'The ID of the document.'
      example: '019c4ece-4c65-70c9-9b0b-09c861785a85'
      required: true
      schema:
        type: string
# POST /api/v1/documents. Takes a list of document IDs (plus an optional
# separator) and returns the tokenized documents as one combined text body.
/api/v1/documents:
  post:
    summary: 'Download multiple tokenized documents as a single combined file.'
    operationId: downloadMultipleTokenizedDocumentsAsASingleCombinedFile
    description: ''
    parameters: []
    responses:
      # Status codes are quoted: OpenAPI requires string keys here so the
      # document converts cleanly between YAML and JSON.
      '200':
        description: Success
        content:
          text/plain:
            schema:
              type: string
              example: 'Returns a combined text file containing all requested tokenized documents.'
      '404':
        description: 'Some documents were not found or do not belong to you.'
        content:
          application/json:
            schema:
              type: object
              example:
                message: 'Some documents were not found or do not belong to you.'
                missing_document_ids:
                  - 9d3a5c8e-4b2f-4a1e-8c3d-5e6f7a8b9c0d
              properties:
                message:
                  type: string
                  example: 'Some documents were not found or do not belong to you.'
                missing_document_ids:
                  type: array
                  example:
                    - 9d3a5c8e-4b2f-4a1e-8c3d-5e6f7a8b9c0d
                  items:
                    type: string
      '422':
        description: 'Some documents are not ready for download.'
        content:
          application/json:
            schema:
              type: object
              example:
                message: 'Some documents are not ready for download.'
                not_ready_documents:
                  - id: 9d3a5c8e-4b2f-4a1e-8c3d-5e6f7a8b9c0d
                    status: processing
                    error_message: null
              properties:
                message:
                  type: string
                  example: 'Some documents are not ready for download.'
                not_ready_documents:
                  type: array
                  example:
                    - id: 9d3a5c8e-4b2f-4a1e-8c3d-5e6f7a8b9c0d
                      status: processing
                      error_message: null
                  items:
                    type: object
                    properties:
                      id:
                        type: string
                        example: 9d3a5c8e-4b2f-4a1e-8c3d-5e6f7a8b9c0d
                      status:
                        type: string
                        example: processing
                      error_message:
                        type: string
                        example: null
                        nullable: true
    tags:
      - Endpoints
    requestBody:
      required: true
      content:
        application/json:
          schema:
            type: object
            properties:
              document_ids:
                type: array
                description: 'Must be a valid UUID.'
                example:
                  - 6ff8f7f6-1eb3-3525-be4a-3932c805afed
                items:
                  type: string
                  # Mirrors the "valid UUID" rule stated in the description.
                  format: uuid
              separator:
                type: string
                description: 'Must not be greater than 1000 characters.'
                # Mirrors the length limit stated in the description.
                maxLength: 1000
                example: g
                nullable: true
            required:
              - document_ids
# GET /api/v1/detokenize/{token}. Looks up one token and returns its token ID
# together with the original PII value.
'/api/v1/detokenize/{token}':
  get:
    summary: "Get a single token's original PII value."
    operationId: getASingleTokensOriginalPIIValue
    description: ''
    parameters: []
    responses:
      # Status codes are quoted: OpenAPI requires string keys here so the
      # document converts cleanly between YAML and JSON.
      '200':
        description: 'Token retrieved successfully.'
        content:
          application/json:
            schema:
              type: object
              example:
                message: 'Token retrieved successfully.'
                data:
                  token_id: 'EMAIL_ADDRESS:f26ab8cf5a26d76fd04f4f7a740b0c0e596975c528cb0028a365898fae44915e'
                  pii_value: john.doe@example.com
              properties:
                message:
                  type: string
                  example: 'Token retrieved successfully.'
                data:
                  type: object
                  properties:
                    token_id:
                      type: string
                      example: 'EMAIL_ADDRESS:f26ab8cf5a26d76fd04f4f7a740b0c0e596975c528cb0028a365898fae44915e'
                    pii_value:
                      type: string
                      example: john.doe@example.com
      '404':
        description: 'Token not found or access denied.'
        content:
          application/json:
            schema:
              type: object
              example:
                message: 'Token not found or access denied.'
              properties:
                message:
                  type: string
                  example: 'Token not found or access denied.'
    tags:
      - Endpoints
  # Path-level parameter shared by every operation on this path.
  parameters:
    - in: path
      name: token
      description: 'The token to look up.'
      # Replaces a generated placeholder with the token ID used in the 200
      # example above.
      # NOTE(review): assumes the path segment is the bare `TYPE:hash` token
      # ID rather than the bracketed `[TOKEN:...]` form; confirm.
      example: 'EMAIL_ADDRESS:f26ab8cf5a26d76fd04f4f7a740b0c0e596975c528cb0028a365898fae44915e'
      required: true
      schema:
        type: string
# POST /api/v1/detokenize. Accepts text content as JSON and returns it with
# tokens replaced by their original PII values, plus replacement bookkeeping.
/api/v1/detokenize:
  post:
    summary: 'Detokenize text content by replacing tokens with original PII values.'
    operationId: detokenizeTextContentByReplacingTokensWithOriginalPIIValues
    description: ''
    parameters: []
    responses:
      # Status codes are quoted: OpenAPI requires string keys here so the
      # document converts cleanly between YAML and JSON.
      '200':
        description: 'Content detokenized successfully.'
        content:
          application/json:
            schema:
              type: object
              example:
                message: 'Content detokenized successfully.'
                data:
                  detokenized_content: 'Contact John Doe at john.doe@example.com or (555) 123-4567.'
                  tokens_found:
                    - 'EMAIL_ADDRESS:f26ab8cf5a26d76fd04f4f7a740b0c0e596975c528cb0028a365898fae44915e'
                    - 'PHONE_NUMBER:07bf49bd8a8796e3e120e873e51da7156904ffdf6049c74c37da599609a7caf4'
                  tokens_replaced:
                    - 'EMAIL_ADDRESS:f26ab8cf5a26d76fd04f4f7a740b0c0e596975c528cb0028a365898fae44915e'
                    - 'PHONE_NUMBER:07bf49bd8a8796e3e120e873e51da7156904ffdf6049c74c37da599609a7caf4'
                  tokens_failed: []
                  total_detokenizations: 2
              properties:
                message:
                  type: string
                  example: 'Content detokenized successfully.'
                data:
                  type: object
                  properties:
                    detokenized_content:
                      type: string
                      example: 'Contact John Doe at john.doe@example.com or (555) 123-4567.'
                    tokens_found:
                      type: array
                      example:
                        - 'EMAIL_ADDRESS:f26ab8cf5a26d76fd04f4f7a740b0c0e596975c528cb0028a365898fae44915e'
                        - 'PHONE_NUMBER:07bf49bd8a8796e3e120e873e51da7156904ffdf6049c74c37da599609a7caf4'
                      items:
                        type: string
                    tokens_replaced:
                      type: array
                      example:
                        - 'EMAIL_ADDRESS:f26ab8cf5a26d76fd04f4f7a740b0c0e596975c528cb0028a365898fae44915e'
                        - 'PHONE_NUMBER:07bf49bd8a8796e3e120e873e51da7156904ffdf6049c74c37da599609a7caf4'
                      items:
                        type: string
                    tokens_failed:
                      type: array
                      example: []
                      # OpenAPI 3.0 requires `items` on array schemas. Left
                      # untyped on purpose.
                      # NOTE(review): presumably token ID strings like
                      # tokens_found; confirm and tighten.
                      items: {}
                    total_detokenizations:
                      type: integer
                      example: 2
    tags:
      - Endpoints
    requestBody:
      required: true
      content:
        application/json:
          schema:
            type: object
            properties:
              # NOTE(review): a string property with a null example and no
              # `nullable`; confirm whether this JSON endpoint accepts `file`
              # at all.
              file:
                type: string
                description: ''
                example: null
              content:
                type: string
                description: 'Must not be greater than 10485760 characters.'
                # Mirrors the length limit stated in the description.
                maxLength: 10485760
                # Tokenized counterpart of the 200 example above, replacing a
                # one-letter generated placeholder.
                example: 'Contact John Doe at [TOKEN:EMAIL_ADDRESS:f26ab8cf5a26d76fd04f4f7a740b0c0e596975c528cb0028a365898fae44915e] or [TOKEN:PHONE_NUMBER:07bf49bd8a8796e3e120e873e51da7156904ffdf6049c74c37da599609a7caf4].'
            required:
              - content
# POST /api/v1/detokenize/file. Multipart upload of a text/csv file; the
# response carries the detokenized content plus replacement bookkeeping.
/api/v1/detokenize/file:
  post:
    summary: 'Detokenize uploaded text/csv files by replacing tokens with original PII values.'
    operationId: detokenizeUploadedTextcsvFilesByReplacingTokensWithOriginalPIIValues
    description: ''
    parameters: []
    responses:
      # Status codes are quoted: OpenAPI requires string keys here so the
      # document converts cleanly between YAML and JSON.
      '200':
        description: 'File detokenized successfully.'
        content:
          application/json:
            schema:
              type: object
              example:
                message: 'File detokenized successfully.'
                data:
                  detokenized_content: 'Contact John Doe at john.doe@example.com or (555) 123-4567.'
                  tokens_found:
                    - 'EMAIL_ADDRESS:f26ab8cf5a26d76fd04f4f7a740b0c0e596975c528cb0028a365898fae44915e'
                    - 'PHONE_NUMBER:07bf49bd8a8796e3e120e873e51da7156904ffdf6049c74c37da599609a7caf4'
                  tokens_replaced:
                    - 'EMAIL_ADDRESS:f26ab8cf5a26d76fd04f4f7a740b0c0e596975c528cb0028a365898fae44915e'
                    - 'PHONE_NUMBER:07bf49bd8a8796e3e120e873e51da7156904ffdf6049c74c37da599609a7caf4'
                  tokens_failed: []
                  total_detokenizations: 2
              properties:
                message:
                  type: string
                  example: 'File detokenized successfully.'
                data:
                  type: object
                  properties:
                    detokenized_content:
                      type: string
                      example: 'Contact John Doe at john.doe@example.com or (555) 123-4567.'
                    tokens_found:
                      type: array
                      example:
                        - 'EMAIL_ADDRESS:f26ab8cf5a26d76fd04f4f7a740b0c0e596975c528cb0028a365898fae44915e'
                        - 'PHONE_NUMBER:07bf49bd8a8796e3e120e873e51da7156904ffdf6049c74c37da599609a7caf4'
                      items:
                        type: string
                    tokens_replaced:
                      type: array
                      example:
                        - 'EMAIL_ADDRESS:f26ab8cf5a26d76fd04f4f7a740b0c0e596975c528cb0028a365898fae44915e'
                        - 'PHONE_NUMBER:07bf49bd8a8796e3e120e873e51da7156904ffdf6049c74c37da599609a7caf4'
                      items:
                        type: string
                    tokens_failed:
                      type: array
                      example: []
                      # OpenAPI 3.0 requires `items` on array schemas. Left
                      # untyped on purpose.
                      # NOTE(review): presumably token ID strings like
                      # tokens_found; confirm and tighten.
                      items: {}
                    total_detokenizations:
                      type: integer
                      example: 2
    tags:
      - Endpoints
    requestBody:
      required: true
      content:
        multipart/form-data:
          schema:
            type: object
            properties:
              file:
                type: string
                format: binary
                description: 'Must be a file. Must not be greater than 10240 kilobytes.'
              # NOTE(review): a string property with a null example and no
              # `nullable`; confirm whether this upload endpoint accepts
              # `content` at all.
              content:
                type: string
                description: ''
                example: null
            required:
              - file
# POST /api/v1/detokenize/tokens. Takes an array of token IDs and returns, per
# token, its original value and a found flag, plus summary counts.
/api/v1/detokenize/tokens:
  post:
    summary: 'Detokenize tokens.'
    operationId: detokenizeTokens
    # Literal block scalar: same text as the former escaped one-line string,
    # with backslashes in the regex now written literally.
    description: |-
      Supply an array of token IDs to retrieve their original PII values.
      Token IDs follow the format: `TYPE:hash` (e.g., `PHONE_NUMBER:abc123...`).

      To extract token IDs from tokenized content, use this regex pattern:
      `/\[TOKEN:([A-Z_]+:[a-f0-9]+)\]/g`

      This will match tokens like:
      `[TOKEN:US_SSN:1d593fdba2209408e11e0384a9a257d2e058d1532ade7ac8c47e0f447b1edaaa]`

      And capture the token ID:
      `US_SSN:1d593fdba2209408e11e0384a9a257d2e058d1532ade7ac8c47e0f447b1edaaa`
    parameters: []
    responses:
      # Status codes are quoted: OpenAPI requires string keys here so the
      # document converts cleanly between YAML and JSON.
      '200':
        description: 'Token retrieval completed.'
        content:
          application/json:
            schema:
              type: object
              example:
                message: 'Token retrieval completed.'
                data:
                  tokens:
                    - token: 'PHONE_NUMBER:07bf49bd8a8796e3e120e873e51da7156904ffdf6049c74c37da599609a7caf4'
                      value: '(555) 123-4567'
                      found: true
                    - token: 'EMAIL_ADDRESS:f26ab8cf5a26d76fd04f4f7a740b0c0e596975c528cb0028a365898fae44915e'
                      value: john.doe@example.com
                      found: true
                  total_requested: 2
                  total_found: 2
                  total_not_found: 0
              properties:
                message:
                  type: string
                  example: 'Token retrieval completed.'
                data:
                  type: object
                  properties:
                    tokens:
                      type: array
                      example:
                        - token: 'PHONE_NUMBER:07bf49bd8a8796e3e120e873e51da7156904ffdf6049c74c37da599609a7caf4'
                          value: '(555) 123-4567'
                          found: true
                        - token: 'EMAIL_ADDRESS:f26ab8cf5a26d76fd04f4f7a740b0c0e596975c528cb0028a365898fae44915e'
                          value: john.doe@example.com
                          found: true
                      items:
                        type: object
                        properties:
                          token:
                            type: string
                            example: 'PHONE_NUMBER:07bf49bd8a8796e3e120e873e51da7156904ffdf6049c74c37da599609a7caf4'
                          value:
                            type: string
                            example: '(555) 123-4567'
                          found:
                            type: boolean
                            example: true
                    total_requested:
                      type: integer
                      example: 2
                    total_found:
                      type: integer
                      example: 2
                    total_not_found:
                      type: integer
                      example: 0
    tags:
      - Endpoints
    requestBody:
      required: true
      content:
        application/json:
          schema:
            type: object
            properties:
              tokens:
                type: array
                description: 'Array of token IDs to retrieve.'
                example:
                  - 'PHONE_NUMBER:07bf49bd8a8796e3e120e873e51da7156904ffdf6049c74c37da599609a7caf4'
                  - 'EMAIL_ADDRESS:f26ab8cf5a26d76fd04f4f7a740b0c0e596975c528cb0028a365898fae44915e'
                items:
                  type: string
            required:
              - tokens
/api/openapi.json:
get:
summary: ''
operationId: getApiOpenapiJson
description: ''
parameters: []
responses:
302:
description: ''
content:
text/plain:
schema:
type: string
example: "\n\n