I'm using Build 160 of the Pro SDK and extract text with PXCp_ET_GetElement. In general it works fine, but now I have a PDF that gives strange results:
the text element reports the same offset for more than one character:
characters: H,i,l,b,e,r,t,s,p,a,c,e,#0
Offset: 0,0,0,0,0.296,0.296,0.296,3.36,3.36,3.36,3.36,3.36,3.36
This is the relevant part of the PDF: TD[(Hilb)-27(ert)-311(space)]
So is this a bug? And if not: how can I determine the position of the third character (l)?
Attached you will find a sample page with this problem (look for the first occurrence of 'Hilbert').
Here's the code I'm using to get a text element:
Code: Select all
// Reads text element AIndex of FDocument into ATextElement using two
// PXCp_ET_GetElement calls: a first call with an empty mask, followed by a
// second call that requests text, offsets, matrix, font info and text params.
// Result is True only if the second call is made and does not fail; it stays
// False when the first call fails or reports no characters.
// NOTE(review): the routine header is not part of this snippet, so the
// declared types of ATextElement/AIndex/AIgnorePageRotation are not visible.
var
hr: HResult;
begin
Result := False;
// Determine the memory required for the text element
ATextElement.cbSize := SizeOf(PXP_TextElement);
// Count = 0 and mask = 0 for the first call; the check below relies on the
// call having updated ATextElement.Count.
// NOTE(review): presumably this is the SDK's "query size" mode - confirm.
ATextElement.Count := 0;
ATextElement.mask := 0;
hr := PXCp_ET_GetElement(FDocument, AIndex, @ATextElement, 0);
if ((not IS_DS_FAILED(hr)) and (ATextElement.Count > 0)) then
begin
// Select the fields to be filled in by the second call.
ATextElement.mask := PTEM_Text + PTEM_Offsets + PTEM_Matrix +
PTEM_FontInfo + PTEM_TextParams;
// Reset both buffers, then size each one to Count entries.
ATextElement.Characters := nil;
ATextElement.Offsets := nil;
SetLength(ATextElement.Characters, ATextElement.Count);
SetLength(ATextElement.Offsets, ATextElement.Count);
// Read the text element
// The two branches differ only in the flags argument:
// GTEF_IgnorePageRotation when AIgnorePageRotation is set, otherwise 0.
if AIgnorePageRotation then
hr := PXCp_ET_GetElement(FDocument, AIndex, @ATextElement, GTEF_IgnorePageRotation)
else
hr := PXCp_ET_GetElement(FDocument, AIndex, @ATextElement, 0);
Result := not IS_DS_FAILED(hr);
end;
end;
best regards,
Ulrich