Examples

Changing partition strategy for a PDF

Here’s how you can modify the partition strategy for a PDF file and select an alternative model to use with the Unstructured API. The hi_res strategy supports different models; the default is detectron2onnx.

 # Partition a PDF with the hi_res strategy and the chipper model.
 curl --request 'POST' <YOUR_API_URL> \
  --header 'accept: application/json' \
  --header 'Content-Type: multipart/form-data' \
  --header 'unstructured-api-key: <YOUR_API_KEY>' \
  --form 'files=@sample-docs/layout-parser-paper.pdf' \
  --form 'strategy=hi_res' \
  --form 'hi_res_model_name=chipper'

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

# Create a client for the Unstructured API.
client = UnstructuredClient(
    api_key_auth="YOUR_API_KEY",
    server_url="YOUR_API_URL",
)

filename = "sample-docs/layout-parser-paper.pdf"

# Open the document in a context manager so the file handle is closed
# once its bytes have been read into the request.
with open(filename, "rb") as file:
    req = shared.PartitionParameters(
        # Note that this currently only supports a single file
        files=shared.Files(
            content=file.read(),
            file_name=filename,
        ),
        # Use the hi_res strategy with the chipper model.
        strategy="hi_res",
        hi_res_model_name="chipper",
    )

# Send the request and print the first returned element; SDK errors are
# printed instead of raised.
try:
    res = client.general.partition(req)
    print(res.elements[0])
except SDKError as e:
    print(e)

import { UnstructuredClient } from "unstructured-client";
import { PartitionResponse } from "unstructured-client/dist/sdk/models/operations";
import * as fs from "fs";

const key = "YOUR_API_KEY";

// Create a client for the Unstructured API.
const client = new UnstructuredClient({
    serverURL: "YOUR_API_URL",
    security: {
        apiKeyAuth: key,
    },
});

const filename = "sample-docs/layout-parser-paper.pdf";
const data = fs.readFileSync(filename);

// Partition the file with the hi_res strategy and the chipper model.
client.general.partition({
    files: {
        content: data,
        fileName: filename,
    },
    strategy: "hi_res",
    hiResModelName: "chipper",
}).then((res: PartitionResponse) => {
    // Strict equality: compare the status code without type coercion.
    if (res.statusCode === 200) {
        console.log(res.elements);
    }
}).catch((e) => {
    // Log the status code and body carried by the error.
    console.log(e.statusCode);
    console.log(e.body);
});

If you have a local deployment of the Unstructured API, you can use other supported models, such as yolox.

Specifying the language of a document for better OCR results

For better OCR results, you can use the languages parameter to specify which languages your document contains. See the Tesseract documentation for a full list of languages.

# Partition an image with the hi_res strategy, telling OCR the text is Korean.
curl --request 'POST' <YOUR_API_URL> \
  --header 'accept: application/json' \
  --header 'Content-Type: multipart/form-data' \
  --header 'unstructured-api-key: <YOUR_API_KEY>' \
  --form 'files=@sample-docs/korean.png' \
  --form 'strategy=hi_res' \
  --form 'languages=kor'

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

# Create a client for the Unstructured API.
client = UnstructuredClient(
    api_key_auth="YOUR_API_KEY",
    server_url="YOUR_API_URL",
)

filename = "sample-docs/korean.png"

# Open the document in a context manager so the file handle is closed
# once its bytes have been read into the request.
with open(filename, "rb") as file:
    req = shared.PartitionParameters(
        # Note that this currently only supports a single file
        files=shared.Files(
            content=file.read(),
            file_name=filename,
        ),
        strategy="hi_res",
        # Languages are passed as a list of language codes.
        languages=["kor"],
    )

# Send the request and print the first returned element; SDK errors are
# printed instead of raised.
try:
    res = client.general.partition(req)
    print(res.elements[0])
except SDKError as e:
    print(e)

import { UnstructuredClient } from "unstructured-client";
import { PartitionResponse } from "unstructured-client/dist/sdk/models/operations";
import * as fs from "fs";

const key = "YOUR_API_KEY";

// Create a client for the Unstructured API.
const client = new UnstructuredClient({
    serverURL: "YOUR_API_URL",
    security: {
        apiKeyAuth: key,
    },
});

const filename = "sample-docs/korean.png";
const data = fs.readFileSync(filename);

// Partition the image with the hi_res strategy and the "kor" language code.
client.general.partition({
    files: {
        content: data,
        fileName: filename,
    },
    strategy: "hi_res",
    languages: ["kor"],
}).then((res: PartitionResponse) => {
    // Strict equality: compare the status code without type coercion.
    if (res.statusCode === 200) {
        console.log(res.elements);
    }
}).catch((e) => {
    // Log the status code and body carried by the error.
    console.log(e.statusCode);
    console.log(e.body);
});

Saving bounding box coordinates

When elements are extracted from PDFs or images, it may be useful to get their bounding boxes as well. Set the coordinates parameter to true to add this field to the elements in the response.

 # Partition a PDF and request bounding box coordinates for each element.
 curl --request 'POST' <YOUR_API_URL> \
  --header 'accept: application/json' \
  --header 'Content-Type: multipart/form-data' \
  --header 'unstructured-api-key: <YOUR_API_KEY>' \
  --form 'files=@sample-docs/layout-parser-paper.pdf' \
  --form 'coordinates=true'

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

# Create a client for the Unstructured API.
client = UnstructuredClient(
    api_key_auth="YOUR_API_KEY",
    server_url="YOUR_API_URL",
)

filename = "sample-docs/layout-parser-paper.pdf"

# Open the document in a context manager so the file handle is closed
# once its bytes have been read into the request.
with open(filename, "rb") as file:
    req = shared.PartitionParameters(
        # Note that this currently only supports a single file
        files=shared.Files(
            content=file.read(),
            file_name=filename,
        ),
        # Ask for bounding box coordinates in the response elements.
        coordinates=True,
    )

# Send the request and print the first returned element; SDK errors are
# printed instead of raised.
try:
    res = client.general.partition(req)
    print(res.elements[0])
except SDKError as e:
    print(e)

import { UnstructuredClient } from "unstructured-client";
import { PartitionResponse } from "unstructured-client/dist/sdk/models/operations";
import * as fs from "fs";

const key = "YOUR_API_KEY";

// Create a client for the Unstructured API.
const client = new UnstructuredClient({
    serverURL: "YOUR_API_URL",
    security: {
        apiKeyAuth: key,
    },
});

const filename = "sample-docs/layout-parser-paper.pdf";
const data = fs.readFileSync(filename);

// Partition the file and request bounding box coordinates.
client.general.partition({
    files: {
        content: data,
        fileName: filename,
    },
    coordinates: true,
}).then((res: PartitionResponse) => {
    // Strict equality: compare the status code without type coercion.
    if (res.statusCode === 200) {
        console.log(res.elements);
    }
}).catch((e) => {
    // Log the status code and body carried by the error.
    console.log(e.statusCode);
    console.log(e.body);
});

Returning unique element IDs

By default, the element ID is a SHA-256 hash of the element text. This is to ensure that the ID is deterministic. One downside is that the ID is not guaranteed to be unique. Different elements with the same text will have the same ID, and there could also be hash collisions. To use UUIDs in the output instead, set unique_element_ids=true. Note: this means that the element IDs will be random, so with every partition of the same file, you will get different IDs. This can be helpful if you’d like to use the IDs as a primary key in a database, for example.

# Partition a PDF and request UUIDs as element IDs.
curl --request 'POST' <YOUR_API_URL> \
 --header 'accept: application/json' \
 --header 'Content-Type: multipart/form-data' \
 --header 'unstructured-api-key: <YOUR_API_KEY>' \
 --form 'files=@sample-docs/layout-parser-paper-fast.pdf' \
 --form 'unique_element_ids=true'

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

# Create a client for the Unstructured API.
client = UnstructuredClient(
    api_key_auth="YOUR_API_KEY",
    server_url="YOUR_API_URL",
)

filename = "sample-docs/layout-parser-paper-fast.pdf"

# Open the document in a context manager so the file handle is closed
# once its bytes have been read into the request.
with open(filename, "rb") as file:
    req = shared.PartitionParameters(
        # Note that this currently only supports a single file
        files=shared.Files(
            content=file.read(),
            file_name=filename,
        ),
        # Request unique element IDs in the output.
        unique_element_ids=True,
    )

# Send the request and print the first returned element; SDK errors are
# printed instead of raised.
try:
    res = client.general.partition(req)
    print(res.elements[0])
except SDKError as e:
    print(e)

import { UnstructuredClient } from "unstructured-client";
import { PartitionResponse } from "unstructured-client/dist/sdk/models/operations";
import * as fs from "fs";

const key = "YOUR_API_KEY";

// Create a client for the Unstructured API.
const client = new UnstructuredClient({
    serverURL: "YOUR_API_URL",
    security: {
        apiKeyAuth: key,
    },
});

const filename = "sample-docs/layout-parser-paper-fast.pdf";
const data = fs.readFileSync(filename);

// Partition the file and request unique element IDs.
client.general.partition({
    files: {
        content: data,
        fileName: filename,
    },
    uniqueElementIds: true,
}).then((res: PartitionResponse) => {
    // Strict equality: compare the status code without type coercion.
    if (res.statusCode === 200) {
        console.log(res.elements);
    }
}).catch((e) => {
    // Log the status code and body carried by the error.
    console.log(e.statusCode);
    console.log(e.body);
});

Adding the chunking step after partitioning

You can combine partitioning and subsequent chunking in a single request by setting the chunking_strategy parameter. By default, the chunking_strategy is set to None, and no chunking is performed.

# Partition a PDF and chunk the result by title, with max_characters set to 1024.
curl --request 'POST' <YOUR_API_URL> \
 --header 'accept: application/json' \
 --header 'Content-Type: multipart/form-data' \
 --header 'unstructured-api-key: <YOUR_API_KEY>' \
 --form 'files=@sample-docs/layout-parser-paper-fast.pdf' \
 --form 'chunking_strategy=by_title' \
 --form 'max_characters=1024'

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

# Create a client for the Unstructured API.
client = UnstructuredClient(
    api_key_auth="YOUR_API_KEY",
    server_url="YOUR_API_URL",
)

filename = "sample-docs/layout-parser-paper-fast.pdf"

# Open the document in a context manager so the file handle is closed
# once its bytes have been read into the request.
with open(filename, "rb") as file:
    req = shared.PartitionParameters(
        # Note that this currently only supports a single file
        files=shared.Files(
            content=file.read(),
            file_name=filename,
        ),
        # Chunk the partitioned output in the same request.
        chunking_strategy="by_title",
        max_characters=1024,
    )

# Send the request and print the first returned element; SDK errors are
# printed instead of raised.
try:
    res = client.general.partition(req)
    print(res.elements[0])
except SDKError as e:
    print(e)

import { UnstructuredClient } from "unstructured-client";
import { PartitionResponse } from "unstructured-client/dist/sdk/models/operations";
import * as fs from "fs";

const key = "YOUR_API_KEY";

// Create a client for the Unstructured API.
const client = new UnstructuredClient({
    serverURL: "YOUR_API_URL",
    security: {
        apiKeyAuth: key,
    },
});

const filename = "sample-docs/layout-parser-paper-fast.pdf";
const data = fs.readFileSync(filename);

// Partition the file and chunk the output by title in the same request.
client.general.partition({
    files: {
        content: data,
        fileName: filename,
    },
    chunkingStrategy: "by_title",
    maxCharacters: 1024,
}).then((res: PartitionResponse) => {
    // Strict equality: compare the status code without type coercion.
    if (res.statusCode === 200) {
        console.log(res.elements);
    }
}).catch((e) => {
    // Log the status code and body carried by the error.
    console.log(e.statusCode);
    console.log(e.body);
});

Unstructured API Services

Getting Started With API Services

Using Unstructured API

Concepts

Endpoints

Changing partition strategy for a PDF

Specifying the language of a document for better OCR results

Saving bounding box coordinates

Returning unique element IDs

Adding the chunking step after partitioning

​Changing partition strategy for a PDF

​Specifying the language of a document for better OCR results

​Saving bounding box coordinates

​Returning unique element IDs

​Adding the chunking step after partitioning

Changing partition strategy for a PDF

Specifying the language of a document for better OCR results

Saving bounding box coordinates

Returning unique element IDs

Adding the chunking step after partitioning