Skip to content

Commit 3f3cf11

Browse files
Merge pull request #1 from SyncfusionExamples/837695-ExtractTextSample
837695 -Added the Text Extraction sample and README file
2 parents efc6565 + 4b45d1d commit 3f3cf11

File tree

11 files changed

+274
-2
lines changed

11 files changed

+274
-2
lines changed

README.md

Lines changed: 136 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,136 @@
1-
# how-to-extract-text-from-a-PDF-document-in-net
2-
How to Extract Text from a PDF Document in .NET using the PDF Library
1+
# How to Extract Text from a PDF Document in .NET using the PDF Library
2+
3+
## Introduction
4+
A quick start .NET console project that shows how to extract text from a PDF document using the Syncfusion PDF Library.
5+
6+
## System requirements
7+
**Framework and SDKs**
8+
* .NET SDK (version 5.0 or later)
9+
10+
**IDEs**
11+
* Visual Studio 2019/ Visual Studio 2022
12+
13+
## Extract text from a specific page
14+
We will create a new .NET console application, add the Syncfusion PDF library package, and write the following code to extract the text of the first page and save it to a text file.
15+
16+
```csharp
17+
//Get stream from an existing PDF document. The using declaration releases the file handle even if extraction fails.
using FileStream docStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read);
//Load the PDF document.
PdfLoadedDocument loadedDocument = new PdfLoadedDocument(docStream);
try {
    //Load the first page.
    PdfPageBase page = loadedDocument.Pages[0];
    //Extract text from first page.
    string extractedText = page.ExtractText();
    //Save the text.
    File.WriteAllText("Result.txt", extractedText);
} finally {
    //Close the document, also when extracting or saving throws.
    loadedDocument.Close(true);
}
29+
```
30+
31+
**Output Image**
32+
<img src="TextExtraction/TextExtractionSample/Output_images/Output.png" alt="output_image" width="100%" Height="Auto"/>
33+
34+
## Layout-based text extraction
35+
We will create a new .NET console application, add the Syncfusion PDF library package, and write the following code to extract the text of the first page using layout-based extraction.
36+
37+
```csharp
38+
//Get stream from an existing PDF document. The using declaration releases the file handle even if extraction fails.
using FileStream docStream = new FileStream("Invoice.pdf", FileMode.Open, FileAccess.Read);
//Load the PDF document.
PdfLoadedDocument loadedDocument = new PdfLoadedDocument(docStream);
try {
    //Load first page.
    PdfPageBase page = loadedDocument.Pages[0];
    //Extract text from first page; passing true selects the layout-based extraction overload.
    string extractedTexts = page.ExtractText(true);
    //Save the text.
    File.WriteAllText("data.txt", extractedTexts);
} finally {
    //Close the document, also when extracting or saving throws.
    loadedDocument.Close(true);
}
50+
```
51+
52+
**Output Image**
53+
<img src="TextExtraction/TextExtractionSample/Output_images/Layout-based-text-extraction.png" alt="output_image" width="100%" Height="Auto"/>
54+
55+
## Extract text from the entire PDF document
56+
We will create a new .NET console application, add the Syncfusion PDF library package, and write the following code to extract the text of every page in the document.
57+
58+
```csharp
59+
//Get stream from an existing PDF document. The using declaration releases the file handle even if extraction fails.
using FileStream docStream = new FileStream("Data.pdf", FileMode.Open, FileAccess.Read);
//Load the PDF document.
PdfLoadedDocument loadedDocument = new PdfLoadedDocument(docStream);
try {
    //Collect the text in a StringBuilder so that long documents do not re-copy the whole text for every page.
    System.Text.StringBuilder extractedText = new System.Text.StringBuilder();
    //Extract all the text from the PDF document pages.
    foreach (PdfLoadedPage loadedPage in loadedDocument.Pages) {
        extractedText.Append(loadedPage.ExtractText());
    }
    //Save the text to file.
    File.WriteAllText("data.txt", extractedText.ToString());
} finally {
    //Close the document, also when extracting or saving throws.
    loadedDocument.Close(true);
}
72+
```
73+
74+
**Output Image**
75+
<img src="TextExtraction/TextExtractionSample/Output_images/ExtractText.png" alt="output_image" width="100%" Height="Auto"/>
76+
77+
## Extract text from predefined bounds
78+
We will create a new .NET console application, add the Syncfusion PDF library package, and write the following code to extract the word located at predefined bounds on the first page.
79+
80+
```csharp
81+
//Get stream from an existing PDF document. The using declaration releases the file handle even if extraction fails.
using FileStream docStream = new FileStream("Invoice.pdf", FileMode.Open, FileAccess.Read);
//Load the PDF document.
PdfLoadedDocument loadedDocument = new PdfLoadedDocument(docStream);
try {
    //Get the first page of the loaded PDF document.
    PdfPageBase page = loadedDocument.Pages[0];
    //Extract text from the first page together with its line and word information.
    page.ExtractText(out TextLineCollection lineCollection);
    RectangleF textBounds = new RectangleF(474.96198f, 161.62997f, 50.040073f, 9);
    //Coordinates are floating-point values, so compare them with a small tolerance instead of exact equality.
    const float tolerance = 0.01f;
    string invoiceNumber = "";
    bool found = false;
    //Get the text provided in the bounds.
    foreach (TextLine textLine in lineCollection.TextLine) {
        foreach (TextWord word in textLine.WordCollection) {
            RectangleF wordBounds = word.Bounds;
            bool sameBounds = Math.Abs(wordBounds.X - textBounds.X) <= tolerance
                && Math.Abs(wordBounds.Y - textBounds.Y) <= tolerance
                && Math.Abs(wordBounds.Width - textBounds.Width) <= tolerance
                && Math.Abs(wordBounds.Height - textBounds.Height) <= tolerance;
            if (sameBounds) {
                invoiceNumber = word.Text;
                found = true;
                break;
            }
        }
        //A break only leaves the inner loop; stop scanning the remaining lines once the word is found.
        if (found) {
            break;
        }
    }
    //Save the text to file.
    File.WriteAllText("data.txt", invoiceNumber);
} finally {
    //Close the PDF document, also when extracting or saving throws.
    loadedDocument.Close(true);
}
106+
```
107+
108+
**Output Image**
109+
<img src="TextExtraction/TextExtractionSample/Output_images/Extract-text-from-predefined-bounds.png" alt="output_image" width="100%" Height="Auto"/>
110+
111+
## How to run the examples
112+
* Download this project to a location on your disk.
113+
* Open the solution file using Visual Studio.
114+
* Rebuild the solution to install the required NuGet package.
115+
* Run the application.
116+
117+
## Resources
118+
* **Product page:** [Syncfusion PDF Framework](https://www.syncfusion.com/document-processing/pdf-framework/net)
119+
* **Documentation page:** [Syncfusion .NET PDF library](https://help.syncfusion.com/file-formats/pdf/overview)
120+
* **Online demo:** [Syncfusion .NET PDF library - Online demos](https://ej2.syncfusion.com/aspnetcore/PDF/CompressExistingPDF#/bootstrap5)
121+
* **Blog:** [Syncfusion .NET PDF library - Blog](https://www.syncfusion.com/blogs/category/pdf)
122+
* **Knowledge Base:** [Syncfusion .NET PDF library - Knowledge Base](https://www.syncfusion.com/kb/windowsforms/pdf)
123+
* **EBooks:** [Syncfusion .NET PDF library - EBooks](https://www.syncfusion.com/succinctly-free-ebooks)
124+
* **FAQ:** [Syncfusion .NET PDF library - FAQ](https://www.syncfusion.com/faq/)
125+
126+
## Support and feedback
127+
* For any other queries, reach our [Syncfusion support team](https://www.syncfusion.com/support/directtrac/incidents/newincident?utm_source=github&utm_medium=listing&utm_campaign=github-docio-examples) or post the queries through the [community forums](https://www.syncfusion.com/forums?utm_source=github&utm_medium=listing&utm_campaign=github-docio-examples).
128+
* Request new features through the [Syncfusion feedback portal](https://www.syncfusion.com/feedback?utm_source=github&utm_medium=listing&utm_campaign=github-docio-examples).
129+
130+
## License
131+
This is a commercial product and requires a paid license for possession or use. Syncfusion’s licensed software, including this component, is subject to the terms and conditions of [Syncfusion's EULA](https://www.syncfusion.com/eula/es/?utm_source=github&utm_medium=listing&utm_campaign=github-docio-examples). You can purchase a license [here](https://www.syncfusion.com/sales/products?utm_source=github&utm_medium=listing&utm_campaign=github-docio-examples) or start a free 30-day trial [here](https://www.syncfusion.com/account/manage-trials/start-trials?utm_source=github&utm_medium=listing&utm_campaign=github-docio-examples).
132+
133+
## About Syncfusion
134+
Founded in 2001 and headquartered in Research Triangle Park, N.C., Syncfusion has more than 26,000 customers and more than 1 million users, including large financial institutions, Fortune 500 companies, and global IT consultancies.
135+
136+
Today, we provide 1600+ components and frameworks for web ([Blazor](https://www.syncfusion.com/blazor-components?utm_source=github&utm_medium=listing&utm_campaign=github-docio-examples), [ASP.NET Core](https://www.syncfusion.com/aspnet-core-ui-controls?utm_source=github&utm_medium=listing&utm_campaign=github-docio-examples), [ASP.NET MVC](https://www.syncfusion.com/aspnet-mvc-ui-controls?utm_source=github&utm_medium=listing&utm_campaign=github-docio-examples), [ASP.NET WebForms](https://www.syncfusion.com/jquery/aspnet-webforms-ui-controls?utm_source=github&utm_medium=listing&utm_campaign=github-docio-examples), [JavaScript](https://www.syncfusion.com/javascript-ui-controls?utm_source=github&utm_medium=listing&utm_campaign=github-docio-examples), [Angular](https://www.syncfusion.com/angular-ui-components?utm_source=github&utm_medium=listing&utm_campaign=github-docio-examples), [React](https://www.syncfusion.com/react-ui-components?utm_source=github&utm_medium=listing&utm_campaign=github-docio-examples), [Vue](https://www.syncfusion.com/vue-ui-components?utm_source=github&utm_medium=listing&utm_campaign=github-docio-examples), and [Flutter](https://www.syncfusion.com/flutter-widgets?utm_source=github&utm_medium=listing&utm_campaign=github-docio-examples)), mobile ([Xamarin](https://www.syncfusion.com/xamarin-ui-controls?utm_source=github&utm_medium=listing&utm_campaign=github-docio-examples), [Flutter](https://www.syncfusion.com/flutter-widgets?utm_source=github&utm_medium=listing&utm_campaign=github-docio-examples), [UWP](https://www.syncfusion.com/uwp-ui-controls?utm_source=github&utm_medium=listing&utm_campaign=github-docio-examples), and [JavaScript](https://www.syncfusion.com/javascript-ui-controls?utm_source=github&utm_medium=listing&utm_campaign=github-docio-examples)), and desktop development ([WinForms](https://www.syncfusion.com/winforms-ui-controls?utm_source=github&utm_medium=listing&utm_campaign=github-docio-examples), 
[WPF](https://www.syncfusion.com/wpf-ui-controls?utm_source=github&utm_medium=listing&utm_campaign=github-docio-examples), [WinUI(Preview)](https://www.syncfusion.com/winui-controls?utm_source=github&utm_medium=listing&utm_campaign=github-docio-examples), [Flutter](https://www.syncfusion.com/flutter-widgets?utm_source=github&utm_medium=listing&utm_campaign=github-docio-examples) and [UWP](https://www.syncfusion.com/uwp-ui-controls?utm_source=github&utm_medium=listing&utm_campaign=github-docio-examples)). We provide ready-to-deploy enterprise software for dashboards, reports, data integration, and big data processing. Many customers have saved millions in licensing fees by deploying our software.
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
2+
Microsoft Visual Studio Solution File, Format Version 12.00
3+
# Visual Studio Version 17
4+
VisualStudioVersion = 17.6.33417.168
5+
MinimumVisualStudioVersion = 10.0.40219.1
6+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "TextExtractionSample", "TextExtractionSample\TextExtractionSample.csproj", "{4C54D05A-A656-444D-9835-58E5E54436BA}"
7+
EndProject
8+
Global
9+
GlobalSection(SolutionConfigurationPlatforms) = preSolution
10+
Debug|Any CPU = Debug|Any CPU
11+
Release|Any CPU = Release|Any CPU
12+
EndGlobalSection
13+
GlobalSection(ProjectConfigurationPlatforms) = postSolution
14+
{4C54D05A-A656-444D-9835-58E5E54436BA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15+
{4C54D05A-A656-444D-9835-58E5E54436BA}.Debug|Any CPU.Build.0 = Debug|Any CPU
16+
{4C54D05A-A656-444D-9835-58E5E54436BA}.Release|Any CPU.ActiveCfg = Release|Any CPU
17+
{4C54D05A-A656-444D-9835-58E5E54436BA}.Release|Any CPU.Build.0 = Release|Any CPU
18+
EndGlobalSection
19+
GlobalSection(SolutionProperties) = preSolution
20+
HideSolutionNode = FALSE
21+
EndGlobalSection
22+
GlobalSection(ExtensibilityGlobals) = postSolution
23+
SolutionGuid = {7A30187C-B337-4CF6-B556-3A1BBC0FDF07}
24+
EndGlobalSection
25+
EndGlobal
3.14 KB
Binary file not shown.
1.85 KB
Binary file not shown.
6.49 KB
Binary file not shown.
17.6 KB
Loading
106 KB
Loading
42.9 KB
Loading
60.7 KB
Loading
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
using Syncfusion.Pdf.Parsing;
2+
using Syncfusion.Pdf;
3+
using Syncfusion.Drawing;
4+
5+
namespace TextExtractionSample {
    /// <summary>
    /// Console sample showing four ways to extract text from a PDF document:
    /// from a single page, layout-based, from every page, and from predefined bounds.
    /// </summary>
    internal class Program {
        /// <summary>
        /// Maximum difference allowed per coordinate when comparing word bounds.
        /// Bounds are floating-point values, so exact equality is fragile.
        /// </summary>
        private const float BoundsTolerance = 0.01f;

        /// <summary>
        /// Registers the Syncfusion license key and runs the selected sample.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args) {
            Syncfusion.Licensing.SyncfusionLicenseProvider.RegisterLicense("YOUR LICENSE KEY");

            //Uncomment the sample to run; three of the four samples write to the same "data.txt" file.
            ExtractText();
            //Extract_Layout();
            //ExtractText_EntirePDF();
            //Extract_Bounds();
        }

        /// <summary>
        /// Extract text from a specific page: reads the first page of Input.pdf
        /// and writes its text to Result.txt.
        /// </summary>
        static void ExtractText() {
            //Get stream from an existing PDF document. The using declaration releases the file handle even if extraction fails.
            using FileStream docStream = new FileStream(Path.GetFullPath("../../../Input.pdf"), FileMode.Open, FileAccess.Read);
            //Load the PDF document.
            PdfLoadedDocument loadedDocument = new PdfLoadedDocument(docStream);
            try {
                //Load the first page.
                PdfPageBase page = loadedDocument.Pages[0];
                //Extract text from first page.
                string extractedText = page.ExtractText();
                //Save the text.
                File.WriteAllText("Result.txt", extractedText);
            } finally {
                //Close the document, also when extracting or saving throws.
                loadedDocument.Close(true);
            }
        }

        /// <summary>
        /// Layout-based text extraction: reads the first page of Invoice.pdf with the
        /// layout flag set and writes its text to data.txt.
        /// </summary>
        static void Extract_Layout() {
            //Get stream from an existing PDF document. The using declaration releases the file handle even if extraction fails.
            using FileStream docStream = new FileStream(Path.GetFullPath("../../../Invoice.pdf"), FileMode.Open, FileAccess.Read);
            //Load the PDF document.
            PdfLoadedDocument loadedDocument = new PdfLoadedDocument(docStream);
            try {
                //Load first page.
                PdfPageBase page = loadedDocument.Pages[0];
                //Extract text from first page; passing true selects the layout-based extraction overload.
                string extractedTexts = page.ExtractText(true);
                //Save the text.
                File.WriteAllText("data.txt", extractedTexts);
            } finally {
                //Close the document, also when extracting or saving throws.
                loadedDocument.Close(true);
            }
        }

        /// <summary>
        /// Extract text from the entire PDF document: concatenates the text of every
        /// page of Data.pdf and writes it to data.txt.
        /// </summary>
        static void ExtractText_EntirePDF() {
            //Get stream from an existing PDF document. The using declaration releases the file handle even if extraction fails.
            using FileStream docStream = new FileStream(Path.GetFullPath("../../../Data.pdf"), FileMode.Open, FileAccess.Read);
            //Load the PDF document.
            PdfLoadedDocument loadedDocument = new PdfLoadedDocument(docStream);
            try {
                //Collect the text in a StringBuilder so that long documents do not re-copy the whole text for every page.
                System.Text.StringBuilder extractedText = new System.Text.StringBuilder();
                //Extract all the text from the PDF document pages.
                foreach (PdfLoadedPage loadedPage in loadedDocument.Pages) {
                    extractedText.Append(loadedPage.ExtractText());
                }
                //Save the text to file.
                File.WriteAllText("data.txt", extractedText.ToString());
            } finally {
                //Close the document, also when extracting or saving throws.
                loadedDocument.Close(true);
            }
        }

        /// <summary>
        /// Extract text from predefined bounds: finds the word on the first page of
        /// Invoice.pdf whose bounds match a fixed rectangle and writes it to data.txt.
        /// An empty file is written when no word matches.
        /// </summary>
        static void Extract_Bounds() {
            //Get stream from an existing PDF document. The using declaration releases the file handle even if extraction fails.
            using FileStream docStream = new FileStream(Path.GetFullPath("../../../Invoice.pdf"), FileMode.Open, FileAccess.Read);
            //Load the PDF document.
            PdfLoadedDocument loadedDocument = new PdfLoadedDocument(docStream);
            try {
                //Get the first page of the loaded PDF document.
                PdfPageBase page = loadedDocument.Pages[0];
                //Extract text from the first page together with its line and word information.
                page.ExtractText(out TextLineCollection lineCollection);
                RectangleF textBounds = new RectangleF(474.96198f, 161.62997f, 50.040073f, 9);
                //Get the text provided in the bounds.
                string invoiceNumber = FindWordText(lineCollection, textBounds);
                //Save the text.
                File.WriteAllText("data.txt", invoiceNumber);
            } finally {
                //Close the PDF document, also when extracting or saving throws.
                loadedDocument.Close(true);
            }
        }

        /// <summary>
        /// Returns the text of the first word whose bounds match <paramref name="textBounds"/>.
        /// </summary>
        /// <param name="lineCollection">Lines produced by the page's text extraction.</param>
        /// <param name="textBounds">Bounds of the word to look for.</param>
        /// <returns>The matching word's text, or an empty string when no word matches.</returns>
        private static string FindWordText(TextLineCollection lineCollection, RectangleF textBounds) {
            foreach (TextLine textLine in lineCollection.TextLine) {
                foreach (TextWord word in textLine.WordCollection) {
                    if (BoundsMatch(textBounds, word.Bounds)) {
                        //Returning here leaves both loops; a plain break would only leave the inner one.
                        return word.Text;
                    }
                }
            }
            return "";
        }

        /// <summary>
        /// Compares two rectangles coordinate by coordinate within <see cref="BoundsTolerance"/>.
        /// </summary>
        /// <param name="expected">The bounds being searched for.</param>
        /// <param name="actual">The bounds of the candidate word.</param>
        /// <returns>True when X, Y, Width and Height each differ by at most the tolerance.</returns>
        private static bool BoundsMatch(RectangleF expected, RectangleF actual) {
            return System.Math.Abs(expected.X - actual.X) <= BoundsTolerance
                && System.Math.Abs(expected.Y - actual.Y) <= BoundsTolerance
                && System.Math.Abs(expected.Width - actual.Width) <= BoundsTolerance
                && System.Math.Abs(expected.Height - actual.Height) <= BoundsTolerance;
        }
    }
}

0 commit comments

Comments
 (0)