J'essaie d'utiliser Python pour traiter certains PDF formulaires remplis et signés à l'aide d'Adobe Acrobat Reader.
J'ai essayé:
Je peux continuer à chercher des bibliothèques et à les essayer, mais j'espère que quelqu'un a déjà une solution efficace à cet égard.
Mise à jour: Sur la base de la réponse de Steven, je me suis penché sur pdfminer.
from argparse import ArgumentParser
import pickle
import pprint
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdftypes import resolve1, PDFObjRef
def load_form(filename):
"""Load pdf form contents into a nested list of name/value tuples"""
with open(filename, 'rb') as file:
parser = PDFParser(file)
doc = PDFDocument()
parser.set_document(doc)
doc.set_parser(parser)
doc.initialize()
return [load_fields(resolve1(f)) for f in
resolve1(doc.catalog['AcroForm'])['Fields']]
def load_fields(field):
"""Recursively load form fields"""
form = field.get('Kids', None)
if form:
return [load_fields(resolve1(f)) for f in form]
else:
# Some field types, like signatures, need extra resolving
return (field.get('T').decode('utf-16'), resolve1(field.get('V')))
def parse_cli():
"""Load command line arguments"""
parser = ArgumentParser(description='Dump the form contents of a PDF.')
parser.add_argument('file', metavar='pdf_form',
help='PDF Form to dump the contents of')
parser.add_argument('-o', '--out', help='Write output to file',
default=None, metavar='FILE')
parser.add_argument('-p', '--pickle', action='store_true', default=False,
help='Format output for python consumption')
return parser.parse_args()
def main():
args = parse_cli()
form = load_form(args.file)
if args.out:
with open(args.out, 'w') as outfile:
if args.pickle:
pickle.dump(form, outfile)
else:
pp = pprint.PrettyPrinter(indent=2)
file.write(pp.pformat(form))
else:
if args.pickle:
print pickle.dumps(form)
else:
pp = pprint.PrettyPrinter(indent=2)
pp.pprint(form)
if __== '__main__':
main()
Vous devriez être capable de le faire avec pdfminer , mais cela nécessitera de plonger dans les composants internes de pdfminer et de connaître le format pdf (formes bien sûr, mais aussi les structures internes de pdf comme "dictionnaires" et " objets indirects ").
Cet exemple pourrait vous aider sur votre chemin (je pense que cela ne fonctionnera que sur des cas simples, sans champs imbriqués, etc.).
import sys
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1
filename = sys.argv[1]
fp = open(filename, 'rb')
parser = PDFParser(fp)
doc = PDFDocument(parser)
fields = resolve1(doc.catalog['AcroForm'])['Fields']
for i in fields:
field = resolve1(i)
name, value = field.get('T'), field.get('V')
print '{0}: {1}'.format(name, value)
EDIT: oublié de mentionner: si vous devez fournir un mot de passe, transmettez-le à doc.initialize()
Python 3.6+:
pip install PyPDF2
# -*- coding: utf-8 -*-
from collections import OrderedDict
from PyPDF2 import PdfFileWriter, PdfFileReader
def _getFields(obj, tree=None, retval=None, fileobj=None):
"""
Extracts field data if this PDF contains interactive form fields.
The *tree* and *retval* parameters are for recursive use.
:param fileobj: A file object (usually a text file) to write
a report to on all interactive form fields found.
:return: A dictionary where each key is a field name, and each
value is a :class:`Field<PyPDF2.generic.Field>` object. By
default, the mapping name is used for keys.
:rtype: dict, or ``None`` if form data could not be located.
"""
fieldAttributes = {'/FT': 'Field Type', '/Parent': 'Parent', '/T': 'Field Name', '/TU': 'Alternate Field Name',
'/TM': 'Mapping Name', '/Ff': 'Field Flags', '/V': 'Value', '/DV': 'Default Value'}
if retval is None:
retval = OrderedDict()
catalog = obj.trailer["/Root"]
# get the AcroForm tree
if "/AcroForm" in catalog:
tree = catalog["/AcroForm"]
else:
return None
if tree is None:
return retval
obj._checkKids(tree, retval, fileobj)
for attr in fieldAttributes:
if attr in tree:
# Tree is a field
obj._buildField(tree, retval, fileobj, fieldAttributes)
break
if "/Fields" in tree:
fields = tree["/Fields"]
for f in fields:
field = f.getObject()
obj._buildField(field, retval, fileobj, fieldAttributes)
return retval
def get_form_fields(infile):
infile = PdfFileReader(open(infile, 'rb'))
fields = _getFields(infile)
return OrderedDict((k, v.get('/V', '')) for k, v in fields.items())
if __== '__main__':
from pprint import pprint
pdf_file_name = 'FormExample.pdf'
pprint(get_form_fields(pdf_file_name))
Mise à jour de la dernière version de pdf mineur (importation de modifications et configuration de l'analyseur syntaxique/doc dans la première fonction)
from argparse import ArgumentParser
import pickle
import pprint
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1
from pdfminer.pdftypes import PDFObjRef
def load_form(filename):
"""Load pdf form contents into a nested list of name/value tuples"""
with open(filename, 'rb') as file:
parser = PDFParser(file)
doc = PDFDocument(parser)
parser.set_document(doc)
#doc.set_parser(parser)
doc.initialize()
return [load_fields(resolve1(f)) for f in
resolve1(doc.catalog['AcroForm'])['Fields']]
def load_fields(field):
"""Recursively load form fields"""
form = field.get('Kids', None)
if form:
return [load_fields(resolve1(f)) for f in form]
else:
# Some field types, like signatures, need extra resolving
return (field.get('T').decode('utf-8'), resolve1(field.get('V')))
def parse_cli():
"""Load command line arguments"""
parser = ArgumentParser(description='Dump the form contents of a PDF.')
parser.add_argument('file', metavar='pdf_form',
help='PDF Form to dump the contents of')
parser.add_argument('-o', '--out', help='Write output to file',
default=None, metavar='FILE')
parser.add_argument('-p', '--pickle', action='store_true', default=False,
help='Format output for python consumption')
return parser.parse_args()
def main():
args = parse_cli()
form = load_form(args.file)
if args.out:
with open(args.out, 'w') as outfile:
if args.pickle:
pickle.dump(form, outfile)
else:
pp = pprint.PrettyPrinter(indent=2)
file.write(pp.pformat(form))
else:
if args.pickle:
print pickle.dumps(form)
else:
pp = pprint.PrettyPrinter(indent=2)
pp.pprint(form)
if __== '__main__':
main()
Travail rapide et sale de 2 minutes; utilisez simplement PDFminer pour convertir PDF en XML, puis récupérez tous les champs.
from xml.etree import ElementTree
from pprint import pprint
import os
def main():
print "Calling PDFDUMP.py"
os.system("dumppdf.py -a FILE.pdf > out.xml")
# Preprocess the file to eliminate bad XML.
print "Screening the file"
o = open("output.xml","w") #open for append
for line in open("out.xml"):
line = line.replace("&#", "Invalid_XML") #some bad data in xml for formatting info.
o.write(line)
o.close()
print "Opening XML output"
tree = ElementTree.parse('output.xml')
lastnode = ""
lastnode2 = ""
list = {}
entry = {}
for node in tree.iter(): # Run through the tree..
# Check if New node
if node.tag == "key" and node.text == "T":
lastnode = node.tag + node.text
Elif lastnode == "keyT":
for child in node.iter():
entry["ID"] = child.text
lastnode = ""
if node.tag == "key" and node.text == "V":
lastnode2 = node.tag + node.text
Elif lastnode2 == "keyV":
for child in node.iter():
if child.tag == "string":
if entry.has_key("ID"):
entry["Value"] = child.text
list[entry["ID"]] = entry["Value"]
entry = {}
lastnode2 = ""
pprint(list)
if __== '__main__':
main()
Ce n'est pas joli, c'est juste une simple preuve de concept. Je dois l'implémenter pour un système sur lequel je travaille, je vais donc le nettoyer, mais je pensais le poster au cas où quelqu'un le jugerait utile.
Le paquet Python PyPDF2 (successeur de pyPdf) est très pratique:
import PyPDF2
f = PyPDF2.PdfFileReader('form.pdf')
ff = f.getFields()
Alors ff
est une dict
qui contient toutes les informations de formulaire pertinentes.
Il y a une faute de frappe sur ces lignes:
file.write(pp.pformat(form))
Devrait être:
outfile.write(pp.pformat(form))