Skip to content

Commit 3be870d

Browse files
authored
Merge pull request #987 from Labelbox/VB/document-entity-annotation_AL-5106
Vb/document entity annotation al 5106
2 parents 44f843a + 0b4590f commit 3be870d

File tree

17 files changed

+870
-664
lines changed

17 files changed

+870
-664
lines changed

examples/annotation_import/pdf.ipynb

Lines changed: 662 additions & 654 deletions
Large diffs are not rendered by default.

labelbox/data/annotation_types/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
from .annotation import VideoObjectAnnotation
1212

1313
from .ner import TextEntity
14+
from .ner import DocumentEntity
15+
from .ner import DocumentTextSelection
1416

1517
from .classification import Checklist
1618
from .classification import ClassificationAnswer

labelbox/data/annotation_types/annotation.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from .classification import Checklist, Dropdown, Radio, Text
77
from .feature import FeatureSchema
88
from .geometry import Geometry, Rectangle, Point
9-
from .ner import TextEntity
9+
from .ner import DocumentEntity, TextEntity
1010

1111

1212
class BaseAnnotation(FeatureSchema, abc.ABC):
@@ -51,7 +51,7 @@ class ObjectAnnotation(BaseAnnotation, ConfidenceMixin):
5151
classifications (Optional[List[ClassificationAnnotation]]): Optional sub classification of the annotation
5252
extra (Dict[str, Any])
5353
"""
54-
value: Union[TextEntity, Geometry]
54+
value: Union[TextEntity, DocumentEntity, Geometry]
5555
classifications: List[ClassificationAnnotation] = []
5656

5757

labelbox/data/annotation_types/label.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import labelbox
88
from labelbox.data.annotation_types.data.tiled_image import TiledImageData
9+
from labelbox.data.annotation_types.ner import DocumentEntity
910
from labelbox.schema import ontology
1011
from .annotation import (ClassificationAnnotation, ObjectAnnotation,
1112
VideoClassificationAnnotation, VideoObjectAnnotation)
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
from .text_entity import TextEntity
2+
from .document_entity import DocumentEntity, DocumentTextSelection
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
from typing import List
2+
3+
from pydantic import BaseModel, validator
4+
5+
from labelbox.utils import _CamelCaseMixin
6+
7+
8+
class DocumentTextSelection(_CamelCaseMixin, BaseModel):
    """A selection of text tokens on one page of a document.

    Attributes:
        token_ids: Ids of the selected tokens.
        group_id: Id of the group the tokens belong to
            (NOTE(review): presumably a group in the document's text
            layer -- confirm against the producer of these ids).
        page: Page number of the selection. Values below 1 are rejected,
            so the first valid page is 1.
    """
    token_ids: List[str]
    group_id: str
    page: int

    @validator("page")
    def validate_page(cls, v):
        """Return ``v`` unchanged, raising ``ValueError`` if it is below 1."""
        if v < 1:
            # Page 1 passes the check above, so the message must say
            # "greater than or equal to 1" rather than "greater than 1".
            raise ValueError("Page must be greater than or equal to 1")
        return v
18+
19+
20+
class DocumentEntity(_CamelCaseMixin, BaseModel):
    """Represents an entity in a document, described by text selections.

    Attributes:
        name: Name of the entity.
        text_selections: The text selections that make up the entity.
    """
    name: str
    text_selections: List[DocumentTextSelection]

labelbox/data/serialization/ndjson/converter.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ def serialize(
4040
Returns:
4141
A generator for accessing the ndjson representation of the data
4242
"""
43+
4344
for example in NDLabel.from_common(labels):
4445
res = example.dict(by_alias=True)
4546
for k, v in list(res.items()):

labelbox/data/serialization/ndjson/objects.py

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from labelbox.data.annotation_types.data.video import VideoData
1313

1414
from ...annotation_types.data import ImageData, TextData, MaskData
15-
from ...annotation_types.ner import TextEntity
15+
from ...annotation_types.ner import DocumentEntity, DocumentTextSelection, TextEntity
1616
from ...annotation_types.types import Cuid
1717
from ...annotation_types.geometry import Rectangle, Polygon, Line, Point, Mask
1818
from ...annotation_types.annotation import ClassificationAnnotation, ObjectAnnotation, VideoObjectAnnotation
@@ -372,6 +372,33 @@ def from_common(cls,
372372
confidence=confidence)
373373

374374

375+
class NDDocumentEntity(NDBaseObject, ConfidenceMixin):
    """NDJSON-side counterpart of ``DocumentEntity``.

    Carries the entity name and its text selections and converts to and
    from the common ``DocumentEntity`` annotation value.
    """
    name: str
    text_selections: List[DocumentTextSelection]

    def to_common(self) -> DocumentEntity:
        """Build a ``DocumentEntity`` from this object's name and selections."""
        entity_name = self.name
        selections = self.text_selections
        return DocumentEntity(name=entity_name, text_selections=selections)

    @classmethod
    def from_common(cls,
                    document_entity: DocumentEntity,
                    classifications: List[ClassificationAnnotation],
                    name: str,
                    feature_schema_id: Cuid,
                    extra: Dict[str, Any],
                    data: Union[ImageData, TextData],
                    confidence: Optional[float] = None) -> "NDDocumentEntity":
        """Create the NDJSON object from a common ``DocumentEntity``.

        The text selections come from ``document_entity``, the data row id
        from ``data.uid`` and the uuid from ``extra`` (``None`` when the
        key is absent). The remaining arguments are passed through.
        """
        selections = document_entity.text_selections
        data_row = DataRow(id=data.uid)
        annotation_uuid = extra.get('uuid')
        return cls(
            text_selections=selections,
            dataRow=data_row,
            name=name,
            schema_id=feature_schema_id,
            uuid=annotation_uuid,
            classifications=classifications,
            confidence=confidence,
        )
400+
401+
375402
class NDObject:
376403

377404
@staticmethod
@@ -434,7 +461,8 @@ def lookup_object(
434461
Polygon: NDPolygon,
435462
Rectangle: NDRectangle,
436463
Mask: NDMask,
437-
TextEntity: NDTextEntity
464+
TextEntity: NDTextEntity,
465+
DocumentEntity: NDDocumentEntity,
438466
}.get(type(annotation.value))
439467
if result is None:
440468
raise TypeError(
@@ -444,6 +472,6 @@ def lookup_object(
444472

445473

446474
NDObjectType = Union[NDLine, NDPolygon, NDPoint, NDRectangle, NDMask,
447-
NDTextEntity]
475+
NDTextEntity, NDDocumentEntity]
448476

449477
NDFrameObjectType = NDFrameRectangle, NDFramePoint, NDFrameLine

tests/assets/arxiv-pdf_data_99-word-token-pdfs_0801.3483-lb-textlayer.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)