0. Introducción
A mí las librerías de Apache siempre me han dado confianza... pero a veces se quedan anticuadas y las nuevas versiones tardan en salir.
Es por ello que cuando estas dan problemas se recurre a librerías alternativas.
En este caso, PDFBox ha sacado una nueva versión que, al parecer, va muy bien.
Por tanto, se adjunta una clase de utilidades que, entre otras cosas:
- Trocea un pdf en grupos de "n" páginas
- Extrae todo el texto de un pdf
- Extrae el texto que hay dentro de una región rectangular
Todo esto ya estaba hecho en posts anteriores, pero con la librería iText.
Veamos la clase java
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 | package openadmin.utils; import java.awt.geom.Rectangle2D; import java.io.File; import java.io.IOException; import java.util.List; import javax.swing.JFileChooser; import javax.swing.filechooser.FileNameExtensionFilter; import org.apache.pdfbox.multipdf.Splitter; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.PDFTextStripperByArea; /** * @author FJGimeno 09/04/2021 * */ public class PDFUtils { /** * Split a pdf file into groups of nPages and * @param file * @param nPages */ public static void splitPdf(File file, int nPages) { try { PDDocument sourceDoc; sourceDoc = PDDocument.load(file); int start = 0, end = 0; if (sourceDoc.getNumberOfPages() > nPages) { try { Splitter splitter = new Splitter(); // Indicates how many pages each "cut" from the document will have . // (if you have a document of 500 pages and split it in documents of 5 pages // each, you will have 250 documents of 2 pages). 
splitter.setSplitAtPage(nPages); List<PDDocument> splittedList = splitter.split(sourceDoc); start = 1; end = nPages; for (PDDocument doc : splittedList) { /* * Saves the document with the added route and name, it is best to give it an * absolute route, in this example, i've used the source file to get the parent * folder's route, also, generates a name for the new file based on the name of * the original name removing it's extension * ("file.getName().substring(0, file.getName().length() - 4)"), then, gives it * a number to differenciate each document, and lastly, gives it an extension. */ doc.save(file.getParent() + System.getProperty("file.separator") + file.getName().substring(0, file.getName().length() - 4) + "_" + start + "-" + (start + (doc.getNumberOfPages() - 1)) + ".pdf"); // this variables are used to give each generated file a proper name (Ex: // document_1-5.pdf, where 1 is "start" and 5 is "end") // they will increase their value each time the loop does on cycle. start = end + 1; end = end + nPages; // Closes the document once each pdf is saved doc.close(); } } catch (Exception e) { e.printStackTrace(); } } } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } } /** * Reads the text included within an area in the pdf * @param file the file to read * @param page the page in the document * @param x coordinate x * @param y coordinate y * @param width * @param height * @return The text (not formatted) */ public static String readPDFArea(File file, int page, int x, int y, int width, int height) { PDDocument document; try { document = PDDocument.load(file); PDFTextStripperByArea textStripper = new PDFTextStripperByArea(); Rectangle2D rect = new java.awt.geom.Rectangle2D.Float(x, y, width, height); textStripper.addRegion("region", rect); PDPage docPage = document.getPage(page); textStripper.extractRegions(docPage); String textForRegion = textStripper.getTextForRegion("region"); return textForRegion; } catch (IOException e) { // 
TODO Auto-generated catch block return e.toString(); } } /** * Read all the document and get the text * @param file To read * @return The text (not formatted) */ public static String readPDF(File file) { PDDocument document; String text = "Initial Value"; try { document = PDDocument.load(file); PDFTextStripper pdfStripper = new PDFTextStripper(); text = pdfStripper.getText(document); document.close(); } catch (IOException e) { e.printStackTrace(); } return text; } /** * @param args * @throws Exception */ public static void main(String[] args) throws Exception { // TODO Auto-generated method stub JFileChooser fc = new JFileChooser(); fc.setFileFilter(new FileNameExtensionFilter("Documento (PDF)", "pdf")); int returnVal = fc.showOpenDialog(null); if (returnVal == JFileChooser.APPROVE_OPTION) { // Step 1 File file = fc.getSelectedFile(); if (!file.getName().contains(".pdf")) { throw new Exception("Selected incorrect file type!"); } else { System.out.println(readPDF(file)); System.out.println("\n-------------------------------------------------------------\n"); //Crida per conseguir DNI i NOM // NIF System.out.println("NIF: " + readPDFArea(file, 0, 30, 75, 110, 10)); // Name System.out.println("Name: " + readPDFArea(file, 0, 90, 75, 600, 10)); splitPdf(file, 2); } } else { System.out.println("Operation canceled by the user"); } } } |
No hay comentarios :
Publicar un comentario