Il peut y avoir un scénario pour lequel nous devons extraire le texte de documents Word afin de le rechercher dans la chaîne du document téléchargé par l'utilisateur, comme pour la recherche dans cv's/resumes. Un problème courant se pose: comment obtenir le texte. Un document Word téléchargé par l'utilisateur contient des liens utiles, mais ne résout pas le problème dans son intégralité. Nous devons obtenir le texte au moment du téléchargement et le sauvegarder dans la base de données, ce qui nous permet d'effectuer facilement une recherche dans la base de données.
Voici une classe simple qui fait le bon travail pour .doc/.docx, PHP lecteur de docx: Convertissez les fichiers MS Word Docx en texte .
class DocxConversion{
private $filename;
public function __construct($filePath) {
$this->filename = $filePath;
}
private function read_doc() {
$fileHandle = fopen($this->filename, "r");
$line = @fread($fileHandle, filesize($this->filename));
$lines = explode(chr(0x0D),$line);
$outtext = "";
foreach($lines as $thisline)
{
$pos = strpos($thisline, chr(0x00));
if (($pos !== FALSE)||(strlen($thisline)==0))
{
} else {
$outtext .= $thisline." ";
}
}
$outtext = preg_replace("/[^a-zA-Z0-9\s\,\.\-\n\r\t@\/\_\(\)]/","",$outtext);
return $outtext;
}
private function read_docx(){
$striped_content = '';
$content = '';
$Zip = Zip_open($this->filename);
if (!$Zip || is_numeric($Zip)) return false;
while ($Zip_entry = Zip_read($Zip)) {
if (Zip_entry_open($Zip, $Zip_entry) == FALSE) continue;
if (Zip_entry_name($Zip_entry) != "Word/document.xml") continue;
$content .= Zip_entry_read($Zip_entry, Zip_entry_filesize($Zip_entry));
Zip_entry_close($Zip_entry);
}// end while
Zip_close($Zip);
$content = str_replace('</w:r></w:p></w:tc><w:tc>', " ", $content);
$content = str_replace('</w:r></w:p>', "\r\n", $content);
$striped_content = strip_tags($content);
return $striped_content;
}
/************************Excel sheet************************************/
function xlsx_to_text($input_file){
$xml_filename = "xl/sharedStrings.xml"; //content file name
$Zip_handle = new ZipArchive;
$output_text = "";
if(true === $Zip_handle->open($input_file)){
if(($xml_index = $Zip_handle->locateName($xml_filename)) !== false){
$xml_datas = $Zip_handle->getFromIndex($xml_index);
$xml_handle = DOMDocument::loadXML($xml_datas, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
$output_text = strip_tags($xml_handle->saveXML());
}else{
$output_text .="";
}
$Zip_handle->close();
}else{
$output_text .="";
}
return $output_text;
}
/*************************power point files*****************************/
function pptx_to_text($input_file){
$Zip_handle = new ZipArchive;
$output_text = "";
if(true === $Zip_handle->open($input_file)){
$slide_number = 1; //loop through slide files
while(($xml_index = $Zip_handle->locateName("ppt/slides/slide".$slide_number.".xml")) !== false){
$xml_datas = $Zip_handle->getFromIndex($xml_index);
$xml_handle = DOMDocument::loadXML($xml_datas, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
$output_text .= strip_tags($xml_handle->saveXML());
$slide_number++;
}
if($slide_number == 1){
$output_text .="";
}
$Zip_handle->close();
}else{
$output_text .="";
}
return $output_text;
}
public function convertToText() {
if(isset($this->filename) && !file_exists($this->filename)) {
return "File Not exists";
}
$fileArray = pathinfo($this->filename);
$file_ext = $fileArray['extension'];
if($file_ext == "doc" || $file_ext == "docx" || $file_ext == "xlsx" || $file_ext == "pptx")
{
if($file_ext == "doc") {
return $this->read_doc();
} elseif($file_ext == "docx") {
return $this->read_docx();
} elseif($file_ext == "xlsx") {
return $this->xlsx_to_text();
}elseif($file_ext == "pptx") {
return $this->pptx_to_text();
}
} else {
return "Invalid File Type";
}
}
}
Document_file_format Les fichiers Doc sont des blobs binaires.Ils peuvent être lus en utilisant fopen . Tandis que les fichiers .docx ne sont que des fichiers Zip et xml des fichiers xml dans un conteneur zipfile (source wikipedia) vous pouvez lisez-les en utilisant Zip_open .
Utilisation de la classe ci-dessus
$docObj = new DocxConversion("test.doc");
//$docObj = new DocxConversion("test.docx");
//$docObj = new DocxConversion("test.xlsx");
//$docObj = new DocxConversion("test.pptx");
echo $docText= $docObj->convertToText();
À partir du fichier DOC
$filename = 'ypue file';
if ( file_exists($filename) ) {
if ( ($fh = fopen($filename, 'r')) !== false ) {
$headers = fread($fh, 0xA00);
$n1 = ( ord($headers[0x21C]) - 1 );
$n2 = ( ( ord($headers[0x21D]) - 8 ) * 256 );
$n3 = ( ( ord($headers[0x21E]) * 256 ) * 256 );
$n4 = ( ( ( ord($headers[0x21F]) * 256 ) * 256 ) * 256 );
$textLength = ($n1 + $n2 + $n3 + $n4);
$extracted_plaintext = fread($fh, $textLength);
echo nl2br($extracted_plaintext);
print_r(extract_emails_from($extracted_plaintext));
}
}
function extract_emails_from($string) {
preg_match_all("/[\._a-zA-Z0-9-]+@[\._a-zA-Z0-9-]+/i", $string, $matches);
return $matches[0];
}
De DOCX:
/*Name of the document file*/
$document = 'your file';
/**Function to extract text*/
function extracttext($filename) {
//Check for extension
$ext = end(explode('.', $filename));
//if its docx file
if($ext == 'docx')
$dataFile = "Word/document.xml";
//else it must be odt file
else
$dataFile = "content.xml";
//Create a new Zip archive object
$Zip = new ZipArchive;
// Open the archive file
if (true === $Zip->open($filename)) {
// If successful, search for the data file in the archive
if (($index = $Zip->locateName($dataFile)) !== false) {
// Index found! Now read it to a string
$text = $Zip->getFromIndex($index);
// Load XML from a string
// Ignore errors and warnings
$xml = DOMDocument::loadXML($text, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
// Remove XML formatting tags and return the text
return strip_tags($xml->saveXML());
}
//Close the archive file
$Zip->close();
}
// In case of failure return a message
return "File not found";
}
echo extracttext($document);
// Pour DOCX.Si vous souhaitez conserver les espaces, prenez également soin des tables tr et tc, utilisez les codes ci-dessous: Modifiez-le à votre goût. Cos il télécharge le fichier à partir d'un distant ou local
//=========DOCX===========
function extractDocxText($url,$file_name){
$docx = get_url($url);
file_put_contents("tempf.docx",$docx);
$xml_filename = "Word/document.xml"; //content file name
$Zip_handle = new ZipArchive;
$output_text = "";
if(true === $Zip_handle->open("tempf.docx")){
if(($xml_index = $Zip_handle->locateName($xml_filename)) !== false){
$xml_datas = $Zip_handle->getFromIndex($xml_index);
//file_put_contents($input_file.".xml",$xml_datas);
$replace_newlines = preg_replace('/<w:p w[0-9-Za-z]+:[a-zA-Z0-9]+="[a-zA-z"0-9 :="]+">/',"\n\r",$xml_datas);
$replace_tableRows = preg_replace('/<w:tr>/',"\n\r",$replace_newlines);
$replace_tab = preg_replace('/<w:tab\/>/',"\t",$replace_tableRows);
$replace_paragraphs = preg_replace('/<\/w:p>/',"\n\r",$replace_tab);
$replace_other_Tags = strip_tags($replace_paragraphs);
$output_text = $replace_other_Tags;
}else{
$output_text .="";
}
$Zip_handle->close();
}else{
$output_text .=" ";
}
chmod("tempf.docx", 0777); unlink(realpath("tempf.docx"));
//save to file or echo content
file_put_contents($file_name,$output_text);
echo $output_text;
}
//========PDF===========
//Requires installation in your Linux server
//Sudo su
//apt-get install xpdf
function extractPdfText($url,$PDF_fullpath_or_Filename){
$pdf = get_url($url);
file_put_contents ("temppdf.txt", $pdf);
$content = pdf2text("temppdf.txt");
chmod("temppdf.txt", 0777); unlink(realpath("temppdf.txt"));
echo $content;
file_put_contents($PDF_fullpath_or_Filename,$content);
}
//========DOC==========
function extractDocText($url,$file_name){
$doc = get_url($url);
file_put_contents ("tempf.txt", $doc);
$fileHandle = fopen("tempf.txt", "r");
$line = @fread($fileHandle, filesize("tempf.txt"));
$lines = explode(chr(0x0D),$line);
$outtext = "";
foreach($lines as $thisline){
$pos = strpos($thisline, chr(0x00));
if (($pos !== FALSE)||(strlen($thisline)==0))
{} else {$outtext .= $thisline."\n\r";}
}
$content = preg_replace('/[a-zA-Z0-9\s\,\.\-\n\r\t@\/\_\(\)]/',' ',$outtext);
//chmod("tempf.txt", 0777); unlink(realpath("tempf.txt"));
echo $content;
file_put_contents($file_name,$content);
}
//========XLSX==========
function extractXlsxText($url,$file_name){
$xlsx = get_url($url);
file_put_contents ("tempf.txt", $xlsx);
$content = "";
$dir = 'tempforxlsx';
// Unzip
$Zip = new ZipArchive();
$Zip->open("tempf.txt");
$Zip->extractTo($dir);
// Open up shared strings & the first worksheet
$strings = simplexml_load_file($dir . '/xl/sharedStrings.xml');
$sheet = simplexml_load_file($dir . '/xl/worksheets/sheet1.xml');
// Parse the rows
$xlrows = $sheet->sheetData->row;
foreach ($xlrows as $xlrow) {
$arr = array();
// In each row, grab it's value
foreach ($xlrow->c as $cell) {
$v = (string) $cell->v;
// If it has a "t" (type?) of "s" (string?), use the value to look up string value
if (isset($cell['t']) && $cell['t'] == 's') {
$s = array();
$si = $strings->si[(int) $v];
// Register & alias the default namespace or you'll get empty results in the xpath query
$si->registerXPathNamespace('n', 'http://schemas.openxmlformats.org/spreadsheetml/2006/main');
// Cat together all of the 't' (text?) node values
foreach($si->xpath('.//n:t') as $t) {
$content .= $t." ";} }
}
}
echo $content;
file_put_contents($file_name,$content);
}
//========PPT==========
function extractPptText($url,$file_name){
$ppt = file_get_contents($url);
file_put_contents ("tempf.ppt", $ppt);
$fileHandle = fopen("tempf.ppt", "r");
$line = @fread($fileHandle, filesize("tempf.ppt"));
$lines = explode(chr(0x0f),$line);
$outtext = '';
foreach($lines as $thisline) {
if (strpos($thisline, chr(0x00).chr(0x00).chr(0x00)) == 1) {
$text_line = substr($thisline, 4);
$end_pos = strpos($text_line, chr(0x00));
$text_line = substr($text_line, 0, $end_pos);
$text_line = preg_replace('/[^a-zA-Z0-9\s\,\.\-\n\r\t@\/\_\(\)]/'," ",$text_line);
$outtext = substr($text_line, 0, $end_pos)."\n".$outtext;
}
}
//echo $outtext;
file_put_contents($file_name,$outtext);
}
//========PPTX==========
function extractPptxText($url,$file_name){
$xls = get_url($url);
file_put_contents ("tempf.txt", $xls);
$Zip_handle = new ZipArchive;
$output_text = ' ';
if(true === $Zip_handle->open("tempf.txt")){
$slide_number = 1; //loop through slide files
while(($xml_index = $Zip_handle->locateName("ppt/slides/slide".$slide_number.".xml")) !== false){
$xml_datas = $Zip_handle->getFromIndex($xml_index); // these four lines of codes
// below were
$xml_handle = new DOMDocument (); // added by me in order
$xml_handle->preserveWhiteSpace = true; // to preserve space between
$xml_handle->formatOutput = true; // each read data
$xml_handle->loadXML($xml_datas, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
$output_text .= $xml_handle->saveXML();
$slide_number++;
}
if($slide_number == 1){
$output_text .= "";
}
$Zip_handle->close();
}else{
$output_text .= "";
}
echo $output_text;
file_put_contents($file_name,$output_text);
}
/*
==========================================================================
=========================================================================
And below is get_url() function: Better than fie_get_contents();
*/
function get_url( $url,$timeout = 5 )
{
$url = str_replace( "&", "&", urldecode(trim($url)) );
$ch = curl_init();
curl_setopt( $ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3) Gecko/20041001 Firefox/0.10.1" );
curl_setopt( $ch, CURLOPT_URL, $url );
curl_setopt( $ch, CURLOPT_FOLLOWLOCATION, true );
curl_setopt( $ch, CURLOPT_ENCODING, "" );
curl_setopt( $ch, CURLOPT_RETURNTRANSFER, true );
curl_setopt( $ch, CURLOPT_AUTOREFERER, true );
curl_setopt( $ch, CURLOPT_SSL_VERIFYPEER, false ); # required for https urls
curl_setopt( $ch, CURLOPT_CONNECTTIMEOUT, $timeout );
curl_setopt( $ch, CURLOPT_TIMEOUT, $timeout );
curl_setopt( $ch, CURLOPT_MAXREDIRS, 10 );
$content = curl_exec( $ch );
//$response = curl_getinfo( $ch );
curl_close ( $ch );
return $content;
}
Pour les documents docx, je suggère l’utilisation de l’outil docx2txt
(disponible au moins sur Debian/Ubuntu):
docx2txt < your_file.docx
README
expliquer comment l’intégrer à vim. Ajouter à votre .vimrc
:
" use docx2txt.pl to allow VIm to view the text content of a .docx file directly.
autocmd BufReadPre *.docx set ro
autocmd BufReadPost *.docx %!docx2txt
(cela explique aussi comment intégrer emacs).
Pour les pirates, cet outil est écrit en Perl.