Knowledgebase

Back to Articles List

PDF-Tools features functionality that locates specified text and adds hyperlinks to it. Is this functionality available in the PDF-Tools SDK?

Question:

PDF-Tools features functionality that locates specified text and adds hyperlinks to it. Is this functionality available in the PDF-Tools SDK?

Answer:

Yes, this functionality is available in the PDF-Tools SDK. 

Several functions are required to perform this operation. Firstly, a set of PXCp_ET_... functions is required to extract the text content and their positions. Secondly, a PXC_pAddLink function is required to add external links to the text, once the positions are identified. (If the links are located in the same document then the PXC_AddGotoAction function must be used for this purpose.)

The most complex part of the operation is the recognition and extraction of the text. This is because the text in PDF documents can take several different forms. Text-based elements may appear identical on screen but have different representations at the file level. For example, a single word on the screen can be represented in several parts at the file level, spaces may be omitted, and the word ordering may be nonsensical until it is presented on the screen.

Follow the steps below, for each document page, in order to enable this functionality:

1. Call PXCp_ET_AnalyzePageContent to detect text on the specified page.

2. Call PXCp_ET_GetElementCount to determine how many text elements are on the page.

3. Use the PXCp_ET_GetElement function to retrieve each element.

4. Analyze the page text in order to detect the required words. N.b. it may be necessary to recompose pages at this point.

5. Add links to the page as desired and then release the acquired data.

The sample code below extracts all text from a document and places it into a new PDF file. (Please ignore the calls to the PXC_... functions; they are related to the creation of the new PDF file.)

   // Example shows how to extract all text from the document

   // and save it to another document retaining formatting

   

   void ExtractTextToOtherPDFDocument(PDFDocument hDoc, LPCWSTR OtherPDFFileName)

   {

      HRESULT hr = DS_OK;

      

      // Preparsing document

      

      hr = PXCp_ET_Prepare(hDoc);

      if (IS_DS_FAILED(hr))

      {

         // report error

         ...

      }

            

      DWORD   fontCount = 0;

      DWORD*   fontIDs = NULL;

      PXCDocument hDstDoc = NULL;

      

      do 

      {

         // 1. Get all fonts from the doc

         hr = PXCp_ET_GetFontCount(hDoc, &fontCount);

         if (IS_DS_FAILED(hr))

            break;

         // 2. Create new doc

         hr = PXC_NewDocument(&hDstDoc, NULL, NULL);

         if(IS_DS_FAILED(hr))

            break;

         fontIDs = new DWORD[fontCount];

         ::ZeroMemory(fontIDs, fontCount * sizeof(DWORD));

         

         // buffer for font name + font style

         LPWSTR   fontName = NULL;

         DWORD   curFontNameLen = 0;

         

         for (DWORD i = 0; i < fontCount; i++)

         {

            DWORD bufLen = 0;

            // get font name length

            // if it is equal to 1 then there is no font name

            // "1" is null-terminator in this case

            hr = PXCp_ET_GetFontName(hDoc, i, NULL, &bufLen);

            if (IS_DS_FAILED(hr))

               break;

            // Check if the font have any name set

            BOOL bNoFontNameSet = bufLen <= 1;

            // if there is no font name default 'Arial' will be used

            static LPCWSTR DefaultFontName = L"Arial";

            if (bNoFontNameSet)

               bufLen = 6; // ::lstrlenW(DefaultFontName)

            // Get the length of font style

            DWORD StyleLen = 0;

            hr = PXCp_ET_GetFontStyle(hDoc, i, NULL, &StyleLen);

            // Check if there is font style sºõ

            if (StyleLen <= 1)

               StyleLen = 0;

            if (IS_DS_SUCCESSFUL(hr) && StyleLen)

            {

               // if there is font style set - then adjust the buffer length

               bufLen += StyleLen;

            }

            // Check for necessary buffer

            if (bufLen > curFontNameLen)

            {

               if (fontName)

                  delete[] fontName;

               fontName = new WCHAR[bufLen];

               curFontNameLen = bufLen;

            }

            if (bNoFontNameSet)

            {

               // if there is no file name - copy default name

               ::lstrcpy(fontName, DefaultFontName);

            }

            else

            {

               // else aquire font name from the library

               DWORD tempBufLen = bufLen;

               hr = PXCp_ET_GetFontName(hDoc, i, fontName, &tempBufLen);

               if (IS_DS_FAILED(hr))

                  break;

            }

            if (StyleLen)

            {

               // if there is font style set - acquire it

               hr = PXCp_ET_GetFontStyle(hDoc, i, fontName + (bufLen - StyleLen) - 1, &StyleLen);

            }

            // add the font into library

            hr = PXC_AddFontW(hDstDoc, FW_NORMAL, FALSE, fontName, fontIDs + i);

            if (IS_DS_FAILED(hr))

               break;

         }

         // clean unnecessary buffer

         if (fontName)

         {

            delete[] fontName;

            fontName = NULL;

         }

         if (IS_DS_FAILED(hr))

         {

            break;

         }

         DWORD PageCnt = 0;

         hr = PXCp_GetPagesCount(hDoc, &PageCnt);

         if (IS_DS_FAILED(hr) || !PageCnt)

            break;

         // 3. for each page

         for (DWORD CurPage = 0; CurPage < PageCnt; CurPage++)

         {

            // create new page in the new document

            PXC_RectF rcMediaBox;

            PXC_RectF rcCropBox;

            LONG nAngle;

 

            hr = PXCp_PageGetBox(hDoc, CurPage, PB_MediaBox, &rcMediaBox);

            if(IS_DS_FAILED(hr))

               break;

            // add to the new page

            PXCPage hDstPage = NULL;

            hr = PXC_AddPage(hDstDoc, rcMediaBox.right - rcMediaBox.left, rcMediaBox.top - rcMediaBox.bottom, &hDstPage);

            if(IS_DS_FAILED(hr))

               break;

 

            hr = PXCp_PageGetBox(hDoc, CurPage, PB_CropBox, &rcCropBox);

            if(IS_DS_SUCCESSFUL(hr))

            {

               hr = PXC_SetPageBox(hDstPage, PB_CropBox, &rcCropBox);

            }

            hr = PXCp_PageGetRotate(hDoc, CurPage, &nAngle);

            if(IS_DS_SUCCESSFUL(hr) && nAngle)

            {

               hr = PXC_SetPageRotation(hDstPage, nAngle);

            }

            PXC_TextOptions pto = { sizeof(PXC_TextOptions) };

            PXC_GetTextOptions(hDstPage, &pto);

            pto.nTextPosition = TextPosition_Baseline;

            PXC_SetTextOptions(hDstPage, &pto);

            //   for each element

            hr = PXCp_ET_AnalyzePageContent(hDoc, CurPage);

            if(IS_DS_FAILED(hr))

               break;

 

            DWORD TextElCount = 0;

            hr = PXCp_ET_GetElementCount(hDoc, &TextElCount);

            if(IS_DS_FAILED(hr) || TextElCount == 0)

               continue;

 

            PXP_TextElement TextElement = {0};

            TextElement.cbSize = sizeof(PXP_TextElement);

            DWORD CurCount = 0;

 

            PXC_PointF ptTextOrg = {0};

            WCHAR buf[2];

            buf[0] = buf[1] = 0;

 

            for (DWORD t = 0; t < TextElCount; t++)

            {

               TextElement.Count = 0;

               TextElement.mask = 0;

               hr = PXCp_ET_GetElement(hDoc, t, &TextElement, 0);

               if(IS_DS_FAILED(hr) || (LONG)TextElement.Count <= 0)

                  continue;

               TextElement.mask = PTEM_Text | PTEM_Offsets | PTEM_Matrix |   PTEM_FontInfo | PTEM_TextParams;

               if (CurCount < TextElement.Count)

               {

                  if (TextElement.Characters != NULL)

                     delete TextElement.Characters;

                  if (TextElement.Offsets != NULL)

                     delete TextElement.Offsets;

                  TextElement.Characters = new WCHAR[TextElement.Count];

                  TextElement.Offsets = new double[TextElement.Count];

                  CurCount = TextElement.Count;

               }

               hr = PXCp_ET_GetElement(hDoc, t, &TextElement, GTEF_IgnorePageRotation);

               if (IS_DS_FAILED(hr))

                  continue;

               // Now add this text element into new PDF document

               hr = PXC_TCS_Transform(hDstPage, &TextElement.Matrix);

 

               if (fontCount <= TextElement.FontIndex)

                  continue;

 

               hr = PXC_SetCurrentFont(hDstPage, fontIDs[TextElement.FontIndex], TextElement.FontSize);

                           hr = PXC_SetFillColor(hDstPage, TextElement.FillColor);

               hr = PXC_SetStrokeColor(hDstPage, TextElement.StrokeColor);

               hr = PXC_SetTextRMode(hDstPage, TextElement.RenderingMode, NULL);

               hr = PXC_SetTextScaling(hDstPage, TextElement.Th, NULL);

               hr = PXC_SetTextLeading(hDstPage, TextElement.Leading, NULL);

               hr = PXC_SetCharSpacing(hDstPage, TextElement.CharSpace, NULL);

               hr = PXC_SetWordSpacing(hDstPage, TextElement.WordSpace, NULL);

               

               for(DWORD j = 0; j < TextElement.Count - 1; j++)

               {

                  ptTextOrg.x = TextElement.Offsets[j];

                  buf[0] = TextElement.Characters[j];

                  hr = PXC_TextOutW(hDstPage, &ptTextOrg, buf, 1);

               }

            }

            if (TextElement.Characters != NULL)

               delete[] TextElement.Characters;

            if (TextElement.Offsets != NULL)

               delete[] TextElement.Offsets;

         }

         if (IS_DS_FAILED(hr))

            break;

      

         hr = PXC_WriteDocumentExW(hDstDoc, OtherPDFFileName, -1, WEF_ShowSaveDialog | WEF_RunApp, NULL);

         

      } while(FALSE);

 

      // clear up

      if (hDstDoc)

      {

         PXC_ReleaseDocument(hDstDoc);

         hDstDoc = NULL;

      }

      if (fontIDs)

      {

         delete[] fontIDs;

         fontIDs = NULL;

      }

      PXCp_ET_Finish(hDoc);

   }

 
Was this article helpful?
Yes No Somewhat