J'ai utilisé ce code avec grand succès pour extraire la première image trouvée dans chaque page d'un PDF. Cependant, il ne fonctionne plus avec certains nouveaux PDF pour une raison inconnue. J'ai utilisé d'autres outils (Datalogics, etc.) qui permettent de bien extraire les images avec ces nouveaux fichiers PDF. Cependant, je ne veux pas acheter Datalogics ni aucun outil si je peux utiliser iTextSharp. Quelqu'un peut-il me dire pourquoi ce code ne trouve pas les images dans le PDF?
Connu: mes PDF n'ont qu'une image par page et rien d'autre.
using iTextSharp.text;
using iTextSharp.text.pdf;
...
public static void ExtractImagesFromPDF(string sourcePdf, string outputPath)
{
// NOTE: This will only get the first image it finds per page.
PdfReader pdf = new PdfReader(sourcePdf);
RandomAccessFileOrArray raf = new iTextSharp.text.pdf.RandomAccessFileOrArray(sourcePdf);
try
{
for (int pageNumber = 1; pageNumber <= pdf.NumberOfPages; pageNumber++)
{
PdfDictionary pg = pdf.GetPageN(pageNumber);
PdfDictionary res = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
PdfDictionary xobj = (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
if (xobj != null)
{
foreach (PdfName name in xobj.Keys)
{
PdfObject obj = xobj.Get(name);
if (obj.IsIndirect())
{
PdfDictionary tg = (PdfDictionary)PdfReader.GetPdfObject(obj);
PdfName type = (PdfName)PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE));
if (PdfName.IMAGE.Equals(type))
{
int XrefIndex = Convert.ToInt32(((PRIndirectReference)obj).Number.ToString(System.Globalization.CultureInfo.InvariantCulture));
PdfObject pdfObj = pdf.GetPdfObject(XrefIndex);
PdfStream pdfStrem = (PdfStream)pdfObj;
byte[] bytes = PdfReader.GetStreamBytesRaw((PRStream)pdfStrem);
if ((bytes != null))
{
using (System.IO.MemoryStream memStream = new System.IO.MemoryStream(bytes))
{
memStream.Position = 0;
System.Drawing.Image img = System.Drawing.Image.FromStream(memStream);
// must save the file while stream is open.
if (!Directory.Exists(outputPath))
Directory.CreateDirectory(outputPath);
string path = Path.Combine(outputPath, String.Format(@"{0}.jpg", pageNumber));
System.Drawing.Imaging.EncoderParameters parms = new System.Drawing.Imaging.EncoderParameters(1);
parms.Param[0] = new System.Drawing.Imaging.EncoderParameter(System.Drawing.Imaging.Encoder.Compression, 0);
System.Drawing.Imaging.ImageCodecInfo jpegEncoder = Utilities.GetImageEncoder("JPEG");
img.Save(path, jpegEncoder, parms);
break;
}
}
}
}
}
}
}
}
catch
{
throw;
}
finally
{
pdf.Close();
raf.Close();
}
}
J'ai trouvé que mon problème était que je ne cherchais pas récursivement à l'intérieur des formulaires et des groupes des images. Fondamentalement, le code d'origine ne trouverait que des images incorporées à la racine du document pdf. Voici la méthode révisée plus une nouvelle méthode (FindImageInPDFDictionary) qui recherche récursivement des images dans la page. REMARQUE: les défauts de ne prendre en charge que les images JPEG et non compressées s'appliquent toujours. Voir le code de R Ubben pour les options pour corriger ces défauts. HTH quelqu'un.
public static void ExtractImagesFromPDF(string sourcePdf, string outputPath)
{
// NOTE: This will only get the first image it finds per page.
PdfReader pdf = new PdfReader(sourcePdf);
RandomAccessFileOrArray raf = new iTextSharp.text.pdf.RandomAccessFileOrArray(sourcePdf);
try
{
for (int pageNumber = 1; pageNumber <= pdf.NumberOfPages; pageNumber++)
{
PdfDictionary pg = pdf.GetPageN(pageNumber);
// recursively search pages, forms and groups for images.
PdfObject obj = FindImageInPDFDictionary(pg);
if (obj != null)
{
int XrefIndex = Convert.ToInt32(((PRIndirectReference)obj).Number.ToString(System.Globalization.CultureInfo.InvariantCulture));
PdfObject pdfObj = pdf.GetPdfObject(XrefIndex);
PdfStream pdfStrem = (PdfStream)pdfObj;
byte[] bytes = PdfReader.GetStreamBytesRaw((PRStream)pdfStrem);
if ((bytes != null))
{
using (System.IO.MemoryStream memStream = new System.IO.MemoryStream(bytes))
{
memStream.Position = 0;
System.Drawing.Image img = System.Drawing.Image.FromStream(memStream);
// must save the file while stream is open.
if (!Directory.Exists(outputPath))
Directory.CreateDirectory(outputPath);
string path = Path.Combine(outputPath, String.Format(@"{0}.jpg", pageNumber));
System.Drawing.Imaging.EncoderParameters parms = new System.Drawing.Imaging.EncoderParameters(1);
parms.Param[0] = new System.Drawing.Imaging.EncoderParameter(System.Drawing.Imaging.Encoder.Compression, 0);
System.Drawing.Imaging.ImageCodecInfo jpegEncoder = Utilities.GetImageEncoder("JPEG");
img.Save(path, jpegEncoder, parms);
}
}
}
}
}
catch
{
throw;
}
finally
{
pdf.Close();
raf.Close();
}
}
private static PdfObject FindImageInPDFDictionary(PdfDictionary pg)
{
PdfDictionary res =
(PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
PdfDictionary xobj =
(PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
if (xobj != null)
{
foreach (PdfName name in xobj.Keys)
{
PdfObject obj = xobj.Get(name);
if (obj.IsIndirect())
{
PdfDictionary tg = (PdfDictionary)PdfReader.GetPdfObject(obj);
PdfName type =
(PdfName)PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE));
//image at the root of the pdf
if (PdfName.IMAGE.Equals(type))
{
return obj;
}// image inside a form
else if (PdfName.FORM.Equals(type))
{
return FindImageInPDFDictionary(tg);
} //image inside a group
else if (PdfName.GROUP.Equals(type))
{
return FindImageInPDFDictionary(tg);
}
}
}
}
return null;
}
Voici une solution plus simple:
iTextSharp.text.pdf.parser.PdfImageObject pdfImage =
new iTextSharp.text.pdf.parser.PdfImageObject(imgPRStream);
System.Drawing.Image img = pdfImage.GetDrawingImage();
Le code suivant incorpore toutes les idées de Dave et R Ubben ci-dessus, en plus il renvoie une liste complète de toutes les images et traite également de plusieurs profondeurs de bits. J'ai dû le convertir en VB pour le projet sur lequel je travaille, désolé pour ça ...
Private Sub getAllImages(ByVal dict As pdf.PdfDictionary, ByVal images As List(Of Byte()), ByVal doc As pdf.PdfReader)
Dim res As pdf.PdfDictionary = CType(pdf.PdfReader.GetPdfObject(dict.Get(pdf.PdfName.RESOURCES)), pdf.PdfDictionary)
Dim xobj As pdf.PdfDictionary = CType(pdf.PdfReader.GetPdfObject(res.Get(pdf.PdfName.XOBJECT)), pdf.PdfDictionary)
If xobj IsNot Nothing Then
For Each name As pdf.PdfName In xobj.Keys
Dim obj As pdf.PdfObject = xobj.Get(name)
If (obj.IsIndirect) Then
Dim tg As pdf.PdfDictionary = CType(pdf.PdfReader.GetPdfObject(obj), pdf.PdfDictionary)
Dim subtype As pdf.PdfName = CType(pdf.PdfReader.GetPdfObject(tg.Get(pdf.PdfName.SUBTYPE)), pdf.PdfName)
If pdf.PdfName.IMAGE.Equals(subtype) Then
Dim xrefIdx As Integer = CType(obj, pdf.PRIndirectReference).Number
Dim pdfObj As pdf.PdfObject = doc.GetPdfObject(xrefIdx)
Dim str As pdf.PdfStream = CType(pdfObj, pdf.PdfStream)
Dim bytes As Byte() = pdf.PdfReader.GetStreamBytesRaw(CType(str, pdf.PRStream))
Dim filter As String = tg.Get(pdf.PdfName.FILTER).ToString
Dim width As String = tg.Get(pdf.PdfName.WIDTH).ToString
Dim height As String = tg.Get(pdf.PdfName.HEIGHT).ToString
Dim bpp As String = tg.Get(pdf.PdfName.BITSPERCOMPONENT).ToString
If filter = "/FlateDecode" Then
bytes = pdf.PdfReader.FlateDecode(bytes, True)
Dim pixelFormat As System.Drawing.Imaging.PixelFormat
Select Case Integer.Parse(bpp)
Case 1
pixelFormat = Drawing.Imaging.PixelFormat.Format1bppIndexed
Case 24
pixelFormat = Drawing.Imaging.PixelFormat.Format24bppRgb
Case Else
Throw New Exception("Unknown pixel format " + bpp)
End Select
Dim bmp As New System.Drawing.Bitmap(Int32.Parse(width), Int32.Parse(height), pixelFormat)
Dim bmd As System.Drawing.Imaging.BitmapData = bmp.LockBits(New System.Drawing.Rectangle(0, 0, Int32.Parse(width), Int32.Parse(height)), System.Drawing.Imaging.ImageLockMode.WriteOnly, pixelFormat)
Marshal.Copy(bytes, 0, bmd.Scan0, bytes.Length)
bmp.UnlockBits(bmd)
Using ms As New MemoryStream
bmp.Save(ms, System.Drawing.Imaging.ImageFormat.Png)
bytes = ms.GetBuffer
End Using
End If
images.Add(bytes)
ElseIf pdf.PdfName.FORM.Equals(subtype) Or pdf.PdfName.GROUP.Equals(subtype) Then
getAllImages(tg, images, doc)
End If
End If
Next
End If
End Sub
C'est juste une autre refonte des idées des autres, mais celle qui a fonctionné pour moi. Ici, j'utilise l'extrait de capture d'image de @ Malco avec le bouclage de R Ubben:
private IList<System.Drawing.Image> GetImagesFromPdfDict(PdfDictionary dict, PdfReader doc)
{
List<System.Drawing.Image> images = new List<System.Drawing.Image>();
PdfDictionary res = (PdfDictionary)(PdfReader.GetPdfObject(dict.Get(PdfName.RESOURCES)));
PdfDictionary xobj = (PdfDictionary)(PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT)));
if (xobj != null)
{
foreach (PdfName name in xobj.Keys)
{
PdfObject obj = xobj.Get(name);
if (obj.IsIndirect())
{
PdfDictionary tg = (PdfDictionary)(PdfReader.GetPdfObject(obj));
PdfName subtype = (PdfName)(PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE)));
if (PdfName.IMAGE.Equals(subtype))
{
int xrefIdx = ((PRIndirectReference)obj).Number;
PdfObject pdfObj = doc.GetPdfObject(xrefIdx);
PdfStream str = (PdfStream)(pdfObj);
iTextSharp.text.pdf.parser.PdfImageObject pdfImage =
new iTextSharp.text.pdf.parser.PdfImageObject((PRStream)str);
System.Drawing.Image img = pdfImage.GetDrawingImage();
images.Add(img);
}
else if (PdfName.FORM.Equals(subtype) || PdfName.GROUP.Equals(subtype))
{
images.AddRange(GetImagesFromPdfDict(tg, doc));
}
}
}
}
return images;
}
Version de c #:
private IList<System.Drawing.Image> GetImagesFromPdfDict(PdfDictionary dict, PdfReader doc){
List<System.Drawing.Image> images = new List<System.Drawing.Image>();
PdfDictionary res = (PdfDictionary)(PdfReader.GetPdfObject(dict.Get(PdfName.RESOURCES)));
PdfDictionary xobj = (PdfDictionary)(PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT)));
if (xobj != null)
{
foreach (PdfName name in xobj.Keys)
{
PdfObject obj = xobj.Get(name);
if (obj.IsIndirect())
{
PdfDictionary tg = (PdfDictionary)(PdfReader.GetPdfObject(obj));
pdf.PdfName subtype = (pdf.PdfName)(pdf.PdfReader.GetPdfObject(tg.Get(pdf.PdfName.SUBTYPE)));
if (pdf.PdfName.IMAGE.Equals(subtype))
{
int xrefIdx = ((pdf.PRIndirectReference)obj).Number;
pdf.PdfObject pdfObj = doc.GetPdfObject(xrefIdx);
pdf.PdfStream str = (pdf.PdfStream)(pdfObj);
byte[] bytes = pdf.PdfReader.GetStreamBytesRaw((pdf.PRStream)str);
string filter = tg.Get(pdf.PdfName.FILTER).ToString();
string width = tg.Get(pdf.PdfName.WIDTH).ToString();
string height = tg.Get(pdf.PdfName.HEIGHT).ToString();
string bpp = tg.Get(pdf.PdfName.BITSPERCOMPONENT).ToString();
if (filter == "/FlateDecode")
{
bytes = pdf.PdfReader.FlateDecode(bytes, true);
System.Drawing.Imaging.PixelFormat pixelFormat;
switch (int.Parse(bpp))
{
case 1:
pixelFormat = System.Drawing.Imaging.PixelFormat.Format1bppIndexed;
break;
case 24:
pixelFormat = System.Drawing.Imaging.PixelFormat.Format24bppRgb;
break;
default:
throw new Exception("Unknown pixel format " + bpp);
}
var bmp = new System.Drawing.Bitmap(Int32.Parse(width), Int32.Parse(height), pixelFormat);
System.Drawing.Imaging.BitmapData bmd = bmp.LockBits(new System.Drawing.Rectangle(0, 0, Int32.Parse(width),
Int32.Parse(height)), System.Drawing.Imaging.ImageLockMode.WriteOnly, pixelFormat);
Marshal.Copy(bytes, 0, bmd.Scan0, bytes.Length);
bmp.UnlockBits(bmd);
using (var ms = new MemoryStream())
{
bmp.Save(ms, System.Drawing.Imaging.ImageFormat.Png);
bytes = ms.GetBuffer();
}
}
images.Add(System.Drawing.Image.FromStream(new MemoryStream(bytes)));
}
else if (pdf.PdfName.FORM.Equals(subtype) || pdf.PdfName.GROUP.Equals(subtype))
{
images.AddRange(GetImagesFromPdfDict(tg, doc));
}
}
}
}
return images;
}
Ce qui précède ne fonctionnera qu'avec les JPEG. À l'exclusion des images en ligne et des fichiers incorporés, vous devez parcourir les objets du sous-type IMAGE, puis regarder le filtre et prendre les mesures appropriées. Voici un exemple, en supposant que nous avons un PdfObject de sous-type IMAGE:
PdfReader pdf = new PdfReader("c:\\temp\\exp0.pdf");
int xo=pdf.XrefSize;
for (int i=0;i<xo;i++)
{
PdfObject obj=pdf.GetPdfObject(i);
if (obj!=null && obj.IsStream())
{
PdfDictionary pd=(PdfDictionary)obj;
if (pd.Contains(PdfName.SUBTYPE) && pd.Get(PdfName.SUBTYPE).ToString()=="/Image")
{
string filter=pd.Get(PdfName.FILTER).ToString();
string width=pd.Get(PdfName.WIDTH).ToString();
string height=pd.Get(PdfName.HEIGHT).ToString();
string bpp=pd.Get(PdfName.BITSPERCOMPONENT).ToString();
string extent=".";
byte [] img=null;
switch (filter)
{
case "/FlateDecode":
byte[] arr=PdfReader.FlateDecode(PdfReader.GetStreamBytesRaw((PRStream)obj),true);
Bitmap bmp=new Bitmap(Int32.Parse(width),Int32.Parse(height),PixelFormat.Format24bppRgb);
BitmapData bmd=bmp.LockBits(new Rectangle(0,0,Int32.Parse(width),Int32.Parse(height)),ImageLockMode.WriteOnly,
PixelFormat.Format24bppRgb);
Marshal.Copy(arr,0,bmd.Scan0,arr.Length);
bmp.UnlockBits(bmd);
bmp.Save("c:\\temp\\bmp1.png",ImageFormat.Png);
break;
default:
break;
}
}
}
}
Cela gâchera la couleur à cause du Microsoft BGR, bien sûr, mais je voulais être bref. Faites quelque chose de similaire pour "/ CCITTFaxDecode", etc.