-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodels.py
More file actions
158 lines (107 loc) · 3.82 KB
/
models.py
File metadata and controls
158 lines (107 loc) · 3.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
from __future__ import annotations
from typing import Any, Dict, List, Literal, Optional, Union
from pydantic import BaseModel, Field
class JsonSchema(BaseModel):
    """Named JSON schema referenced by ``ResponseFormat.json_schema``.

    ``name`` and ``schema_definition`` are required; ``description`` and
    ``strict`` fall back to ``None`` and ``False`` respectively.
    """

    name: str

    # Free-form mapping holding the schema body; its contents are not
    # validated by this model.
    # NOTE(review): Mistral's public API spells this key ``schema``. No alias
    # is declared here, so payloads must use ``schema_definition`` -- confirm
    # that this is what callers actually send.
    schema_definition: Dict[str, Any]

    description: Optional[str] = None

    # Plain boolean flag; how strictness is enforced is up to the consumer.
    strict: bool = False
class ResponseFormat(BaseModel):
    """Selects the output format: text, a JSON object, or a JSON schema.

    ``type`` defaults to ``"text"``. ``json_schema`` is optional, and this
    model does not itself require it when ``type`` is ``"json_schema"``.
    """

    type: Literal["text", "json_object", "json_schema"] = "text"

    # Schema payload; left as None unless the caller supplies one.
    json_schema: Optional[JsonSchema] = None
class OCRImageObject(BaseModel):
    """One image entry listed under ``OCRPageObject.images``.

    Only ``id`` is required; the bounding-box corners, the annotation and
    the base64 payload all default to ``None``.
    """

    id: str

    # Bounding-box corners as integers.
    # NOTE(review): units and origin (presumably page pixels, top-left
    # origin) are not stated in this file -- confirm against the producer.
    top_left_x: Optional[int] = None
    top_left_y: Optional[int] = None
    bottom_right_x: Optional[int] = None
    bottom_right_y: Optional[int] = None

    # Optional textual annotation attached to the image.
    image_annotation: Optional[str] = None

    # Optional base64-encoded image data.
    image_base64: Optional[str] = None
class OCRTableObject(BaseModel):
    """One table entry listed under ``OCRPageObject.tables``.

    All three fields are required.
    """

    id: str

    # Markup language used by ``content``: either "markdown" or "html".
    format: Literal["markdown", "html"]

    # Table body as a string in the markup named by ``format``.
    content: str
class OCRPageDimensions(BaseModel):
    """Integer size description of a page; every field is required."""

    dpi: int

    # NOTE(review): presumably pixel counts at ``dpi`` -- the unit is not
    # stated in this file; confirm against the producer.
    width: int
    height: int
class OCRPageObject(BaseModel):
    """OCR result for a single page.

    ``index`` and ``markdown`` are required. The list fields default to a
    fresh empty list per instance, and the remaining fields to ``None``.
    """

    # NOTE(review): whether the index is 0- or 1-based is not stated here.
    index: int

    # Page content rendered as markdown text.
    markdown: str

    # Collections attached to the page; ``default_factory`` gives every
    # instance its own empty list instead of a shared one.
    images: List[OCRImageObject] = Field(default_factory=list)
    tables: List[OCRTableObject] = Field(default_factory=list)
    hyperlinks: List[str] = Field(default_factory=list)

    # Optional header / footer text.
    header: Optional[str] = None
    footer: Optional[str] = None

    # Optional page size information.
    dimensions: Optional[OCRPageDimensions] = None
class OCRUsageInfo(BaseModel):
    """Usage counters reported in ``OCRResponse.usage_info``."""

    # Document size in bytes; None when it is not reported.
    doc_size_bytes: Optional[int] = None

    # Number of pages processed (required).
    pages_processed: int
class FileChunk(BaseModel):
    """Document reference by file id; one member of ``DocumentChunk``."""

    # Fixed tag value that identifies this chunk variant.
    type: Literal["file"] = "file"

    # Identifier of the referenced file (required).
    file_id: str
class DocumentURLChunk(BaseModel):
    """Document reference by URL; one member of ``DocumentChunk``."""

    # Fixed tag value that identifies this chunk variant.
    type: Literal["document_url"] = "document_url"

    # Location of the document (required). Typed as a plain string, so no
    # URL validation happens in this model.
    document_url: str

    # Optional name for the document.
    document_name: Optional[str] = None
class ImageURLChunk(BaseModel):
    """Image reference by URL; one member of ``DocumentChunk``."""

    # Fixed tag value that identifies this chunk variant.
    type: Literal["image_url"] = "image_url"

    # Either a plain string or a free-form mapping (required).
    # NOTE(review): the expected keys of the mapping form are not defined in
    # this file -- confirm against the code that consumes this field.
    image_url: Union[str, Dict[str, Any]]
# Any of the three chunk models accepted as ``OCRRequest.document``.
DocumentChunk = Union[
    FileChunk,
    DocumentURLChunk,
    ImageURLChunk,
]
class OCRRequest(BaseModel):
    """Request body for an OCR call.

    ``document`` is the only required field; everything else has a default.
    Several fields are ``Optional`` yet default to a non-``None`` value, so
    an explicit ``None`` passes type validation and is distinct from
    omitting the field.
    """

    # --- Input and model selection ---------------------------------------
    document: DocumentChunk
    model: Optional[str] = None

    # --- Annotation output formats ---------------------------------------
    bbox_annotation_format: Optional[ResponseFormat] = None
    document_annotation_format: Optional[ResponseFormat] = None

    # Optional caller-supplied identifier.
    id: Optional[str] = None

    # --- Image handling --------------------------------------------------
    # NOTE(review): units of ``image_min_size`` are not stated in this file.
    image_limit: Optional[int] = None
    image_min_size: Optional[int] = None
    include_image_base64: Optional[bool] = None

    # --- Page selection and layout extras --------------------------------
    # NOTE(review): whether page numbers are 0- or 1-based is not stated.
    pages: Optional[List[int]] = None
    extract_header: bool = False
    extract_footer: bool = False
    table_format: Optional[Literal["markdown", "html"]] = None
    include_raw: bool = False

    # --- PDF behavior ----------------------------------------------------
    pdf_mode: Optional[Literal["auto", "ocr", "text"]] = "auto"
    # NOTE(review): presumably thresholds used to choose between the text
    # and OCR paths -- their exact semantics are not visible in this file.
    pdf_text_min_chars: Optional[int] = None
    pdf_text_sample_pages: Optional[int] = None
    extract_figures: Optional[bool] = True

    # Replace images with parsed figure text in the markdown output.
    inline_figure_text: Optional[bool] = True
    figure_prompt: Optional[str] = None
class OCRResponse(BaseModel):
    """Response body of an OCR call.

    ``model``, ``pages`` and ``usage_info`` are required; the annotation and
    the raw model output default to ``None``.
    """

    model: str

    # One entry per page result.
    pages: List[OCRPageObject]

    usage_info: OCRUsageInfo

    # Optional document-level annotation, carried as a string.
    document_annotation: Optional[str] = None

    # NOTE(review): presumably filled in when the request sets
    # ``include_raw`` -- confirm against the handler that builds this.
    raw_model_output: Optional[str] = None
# ---------------------------
# Files API models (Mistral-compatible)
# ---------------------------
# Allowed values for ``FileSchema.purpose``.
Purpose = Literal[
    "fine-tune",
    "batch",
    "ocr",
]

# Allowed values for ``FileSchema.sample_type``.
SampleType = Literal[
    "pretrain",
    "instruct",
    "batch_request",
    "batch_result",
    "batch_error",
]

# Allowed values for ``FileSchema.source``.
FileSource = Literal[
    "upload",
    "repository",
    "mistral",
]
class FileSchema(BaseModel):
    """Metadata record for a stored file (Files API, Mistral-compatible).

    ``id``, ``created_at`` and ``filename`` are required. ``object``,
    ``purpose`` and ``source`` default to ``"file"``, ``"ocr"`` and
    ``"upload"``; all remaining fields default to ``None``.
    """

    # --- Identity --------------------------------------------------------
    id: str
    object: Literal["file"] = "file"

    # --- Size, creation time, name ---------------------------------------
    # File size as an integer; None when unknown.
    bytes: Optional[int] = None
    # NOTE(review): presumably a Unix timestamp in seconds -- confirm.
    created_at: int
    filename: str

    # --- Classification --------------------------------------------------
    purpose: Purpose = "ocr"
    sample_type: Optional[SampleType] = None
    source: FileSource = "upload"

    # --- Optional extras -------------------------------------------------
    num_lines: Optional[int] = None
    mimetype: Optional[str] = None
    signature: Optional[str] = None
    deleted: Optional[bool] = None
class FileListResponse(BaseModel):
    """List envelope holding ``FileSchema`` records."""

    # Fixed tag value for list responses.
    object: Literal["list"] = "list"

    # The file records themselves (required).
    data: List[FileSchema]

    # Optional count; this model does not tie it to ``len(data)``.
    total: Optional[int] = None
class DeleteFileResponse(BaseModel):
    """Result of a file deletion request."""

    # Identifier of the file the response refers to (required).
    id: str

    # Fixed tag value for file objects.
    object: Literal["file"] = "file"

    # Deletion flag reported to the caller (required).
    deleted: bool
class SignedURLResponse(BaseModel):
    """Response wrapping a single URL string."""

    # The URL (required); typed as a plain string, so it is not validated.
    url: str