0. Introducción

A mí las librerías de Apache siempre me han dado confianza... pero a veces se quedan anticuadas y las nuevas versiones tardan en salir.

Es por ello que cuando estas dan problemas se recurre a librerías alternativas.

En este caso, PDFBox ha sacado una nueva versión que, al parecer, va muy bien.

Por tanto, se adjunta una clase de utilidades que, entre otras cosas:

Trocea un PDF en grupos de "n" páginas
Extrae todo el texto de un PDF
Extrae el texto que hay dentro de una región rectangular

Todo esto ya estaba hecho en posts anteriores, pero con la librería iText.

Veamos la clase Java:

package openadmin.utils;

import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.IOException;
import java.util.List;

import javax.swing.JFileChooser;
import javax.swing.filechooser.FileNameExtensionFilter;

import org.apache.pdfbox.multipdf.Splitter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripperByArea;

/**
 * PDF helper utilities built on Apache PDFBox: split a document into
 * fixed-size parts, extract all of its text, or extract the text found inside
 * a rectangular region of one page.
 *
 * @author FJGimeno 09/04/2021
 */
public class PDFUtils {

	/** Extension expected on input files and appended to every generated part. */
	private static final String PDF_EXTENSION = ".pdf";

	/** Name under which the single extraction rectangle is registered. */
	private static final String REGION_NAME = "region";

	/**
	 * Splits a PDF file into consecutive parts of at most {@code nPages} pages.
	 * <p>
	 * Each part is written next to the source file as
	 * {@code <name>_<firstPage>-<lastPage>.pdf}, where the page numbers are
	 * 1-based positions in the source document (e.g. a 5-page {@code doc.pdf}
	 * split with {@code nPages = 2} yields {@code doc_1-2.pdf},
	 * {@code doc_3-4.pdf} and {@code doc_5-5.pdf}). Nothing is written when the
	 * document has {@code nPages} pages or fewer.
	 * <p>
	 * Failures are reported on standard error and not propagated.
	 *
	 * @param file   the PDF file to split
	 * @param nPages maximum number of pages per generated part; must be positive
	 */
	public static void splitPdf(File file, int nPages) {
		if (nPages <= 0) {
			// A non-positive chunk size cannot produce any part; report it and
			// keep the method's best-effort (non-throwing) contract.
			System.err.println("splitPdf: nPages must be positive but was " + nPages);
			return;
		}
		// The source must stay open until every part has been saved, so it is
		// the outermost resource and is closed last.
		try (PDDocument sourceDoc = PDDocument.load(file)) {
			if (sourceDoc.getNumberOfPages() <= nPages) {
				return;
			}
			Splitter splitter = new Splitter();
			splitter.setSplitAtPage(nPages);
			List<PDDocument> parts = splitter.split(sourceDoc);
			try {
				String baseName = stripPdfExtension(file.getName());
				int firstPage = 1;
				for (PDDocument part : parts) {
					int lastPage = firstPage + part.getNumberOfPages() - 1;
					// File(parent, child) tolerates a null parent, so a relative
					// file without a parent directory no longer becomes "null/...".
					File target = new File(file.getParentFile(),
							baseName + "_" + firstPage + "-" + lastPage + PDF_EXTENSION);
					part.save(target);
					firstPage = lastPage + 1;
				}
			} finally {
				// Close every part, including those never reached because an
				// earlier save failed.
				closeAll(parts);
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Reads the text contained in a rectangular area of one page.
	 *
	 * @param file   the file to read
	 * @param page   index of the page, as accepted by {@code PDDocument.getPage}
	 *               (the demo in {@link #main(String[])} uses {@code 0} for the
	 *               first page)
	 * @param x      x coordinate of the rectangle
	 * @param y      y coordinate of the rectangle
	 * @param width  width of the rectangle
	 * @param height height of the rectangle
	 * @return the text (not formatted) found in the area; if the file cannot be
	 *         read, the {@code toString()} of the {@link IOException} is
	 *         returned instead
	 */
	public static String readPDFArea(File file, int page, int x, int y, int width, int height) {
		try (PDDocument document = PDDocument.load(file)) {
			PDFTextStripperByArea textStripper = new PDFTextStripperByArea();
			Rectangle2D rect = new Rectangle2D.Float(x, y, width, height);
			textStripper.addRegion(REGION_NAME, rect);
			PDPage docPage = document.getPage(page);
			textStripper.extractRegions(docPage);
			return textStripper.getTextForRegion(REGION_NAME);
		} catch (IOException e) {
			// Kept for backward compatibility: callers receive the error text.
			return e.toString();
		}
	}

	/**
	 * Reads the whole document and returns its text.
	 *
	 * @param file the file to read
	 * @return the text (not formatted); the placeholder {@code "Initial Value"}
	 *         when the document could not be read (the error is printed to
	 *         standard error)
	 */
	public static String readPDF(File file) {
		String text = "Initial Value";
		try (PDDocument document = PDDocument.load(file)) {
			text = new PDFTextStripper().getText(document);
		} catch (IOException e) {
			e.printStackTrace();
		}
		return text;
	}

	/**
	 * Demo entry point: lets the user pick a PDF, prints its full text, prints
	 * two fixed regions of the first page (NIF and name) and finally splits the
	 * file into parts of two pages.
	 *
	 * @param args unused
	 * @throws Exception if the selected file does not have a {@code .pdf} extension
	 */
	public static void main(String[] args) throws Exception {
		JFileChooser fc = new JFileChooser();
		fc.setFileFilter(new FileNameExtensionFilter("Documento (PDF)", "pdf"));

		if (fc.showOpenDialog(null) != JFileChooser.APPROVE_OPTION) {
			System.out.println("Operation canceled by the user");
			return;
		}

		File file = fc.getSelectedFile();
		// Check the real extension, ignoring case, to agree with the chooser's
		// filter; a plain contains(".pdf") accepted "x.pdf.txt" and rejected "X.PDF".
		if (!hasPdfExtension(file.getName())) {
			throw new IllegalArgumentException("Selected incorrect file type!");
		}

		System.out.println(readPDF(file));
		System.out.println("\n-------------------------------------------------------------\n");

		// Calls that fetch the national ID (NIF) and the name from fixed areas
		// of the first page.
		System.out.println("NIF: " + readPDFArea(file, 0, 30, 75, 110, 10));
		System.out.println("Name: " + readPDFArea(file, 0, 90, 75, 600, 10));

		splitPdf(file, 2);
	}

	/** Tells whether {@code fileName} ends with ".pdf", ignoring case. */
	private static boolean hasPdfExtension(String fileName) {
		int offset = fileName.length() - PDF_EXTENSION.length();
		// regionMatches returns false for a negative offset (name too short).
		return fileName.regionMatches(true, offset, PDF_EXTENSION, 0, PDF_EXTENSION.length());
	}

	/** Returns {@code fileName} without a trailing ".pdf"; other names are returned unchanged. */
	private static String stripPdfExtension(String fileName) {
		if (!hasPdfExtension(fileName)) {
			return fileName;
		}
		return fileName.substring(0, fileName.length() - PDF_EXTENSION.length());
	}

	/** Closes every document, reporting (not propagating) individual close failures. */
	private static void closeAll(List<PDDocument> documents) {
		for (PDDocument document : documents) {
			try {
				document.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}

}

Informatica Dantesca

Páginas

viernes, 9 de abril de 2021

PDF con PDFBOX (5) Rediseñando con la librería de Apache

0. Introducción

No hay comentarios :

Publicar un comentario