?
Weight/Term.php 0000666 00000006174 15125175533 0007436 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Weight */
require_once 'Zend/Search/Lucene/Search/Weight.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Weight_Term extends Zend_Search_Lucene_Search_Weight
{
    /**
     * Index reader used to obtain the similarity object and passed to its idf() call.
     *
     * @var Zend_Search_Lucene_Interface
     */
    private $_reader;

    /**
     * Term this weight is calculated for.
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_term;

    /**
     * Query the term belongs to; its boost is folded into the query weight.
     *
     * @var Zend_Search_Lucene_Search_Query
     */
    private $_query;

    /**
     * Inverse document frequency factor, set by sumOfSquaredWeights().
     *
     * @var float
     */
    private $_idf;

    /**
     * Query weight (idf * boost), later multiplied by the query norm in normalize().
     *
     * @var float
     */
    private $_queryWeight;

    /**
     * Stores the term, the owning query and the index reader for later weight calculation.
     * No calculation is performed here.
     *
     * @param Zend_Search_Lucene_Index_Term   $term
     * @param Zend_Search_Lucene_Search_Query $query
     * @param Zend_Search_Lucene_Interface    $reader
     */
    public function __construct(Zend_Search_Lucene_Index_Term   $term,
                                Zend_Search_Lucene_Search_Query $query,
                                Zend_Search_Lucene_Interface    $reader)
    {
        $this->_term   = $term;
        $this->_query  = $query;
        $this->_reader = $reader;
    }

    /**
     * Calculates idf and the (not yet normalized) query weight and returns the squared weight.
     *
     * Must be invoked before normalize(), which relies on the values stored here.
     *
     * @return float
     */
    public function sumOfSquaredWeights()
    {
        // idf is provided by the similarity object of the reader
        $similarity = $this->_reader->getSimilarity();
        $this->_idf = $similarity->idf($this->_term, $this->_reader);

        // query weight = idf * query boost
        $boost = $this->_query->getBoost();
        $this->_queryWeight = $this->_idf * $boost;

        $weight = $this->_queryWeight;

        return $weight * $weight;
    }

    /**
     * Applies the query normalization factor and calculates the final weight value.
     *
     * NOTE(review): $_queryNorm and $_value are not declared in this class; presumably they
     * are inherited from Zend_Search_Lucene_Search_Weight - confirm against the parent.
     *
     * @param float $queryNorm
     */
    public function normalize($queryNorm)
    {
        $this->_queryNorm = $queryNorm;

        // scale the query weight by the normalization factor
        $this->_queryWeight = $this->_queryWeight * $queryNorm;

        // multiply by idf once more to get the resulting value
        $this->_value = $this->_queryWeight * $this->_idf;
    }
}
Weight/Empty.php 0000666 00000003010 15125175533 0007607 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Weight */
require_once 'Zend/Search/Lucene/Search/Weight.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Weight_Empty extends Zend_Search_Lucene_Search_Weight
{
    /**
     * Constant contribution to the sum of squared weights.
     *
     * This weight has no terms to calculate anything from, so a fixed value of 1 is reported.
     *
     * @return float
     */
    public function sumOfSquaredWeights()
    {
        $constantWeight = 1;

        return $constantWeight;
    }

    /**
     * Accepts the query normalization factor and intentionally does nothing with it.
     *
     * @param float $queryNorm
     */
    public function normalize($queryNorm)
    {
        // nothing to normalize
    }
}
Weight/MultiTerm.php 0000666 00000007157 15125175533 0010453 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Weight */
require_once 'Zend/Search/Lucene/Search/Weight.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Weight_MultiTerm extends Zend_Search_Lucene_Search_Weight
{
    /**
     * Index reader handed over to every term weight.
     *
     * @var Zend_Search_Lucene_Interface
     */
    private $_reader;

    /**
     * Query this weight belongs to.
     *
     * @var Zend_Search_Lucene_Search_Query
     */
    private $_query;

    /**
     * Weights of the non-prohibited query terms, indexed by term id.
     * Array of Zend_Search_Lucene_Search_Weight_Term
     *
     * @var array
     */
    private $_weights;

    /**
     * Creates a term weight for every term of the query which is not prohibited
     * and registers it at the query through setWeight().
     *
     * A term is skipped only when the query has signs and the sign of the term
     * is non-null and evaluates to false.
     *
     * @param Zend_Search_Lucene_Search_Query $query
     * @param Zend_Search_Lucene_Interface    $reader
     */
    public function __construct(Zend_Search_Lucene_Search_Query $query,
                                Zend_Search_Lucene_Interface    $reader)
    {
        $this->_query   = $query;
        $this->_reader  = $reader;
        $this->_weights = array();

        $signs = $query->getSigns();

        foreach ($query->getTerms() as $id => $term) {
            // prohibited term: contributes nothing to the weight
            if ($signs !== null && $signs[$id] !== null && !$signs[$id]) {
                continue;
            }

            $termWeight = new Zend_Search_Lucene_Search_Weight_Term($term, $query, $reader);

            $this->_weights[$id] = $termWeight;
            $query->setWeight($id, $termWeight);
        }
    }

    /**
     * Weight value of the query.
     * The boost of the query is returned; the stored Weight value is not consulted here.
     *
     * @return float
     */
    public function getValue()
    {
        $boost = $this->_query->getBoost();

        return $boost;
    }

    /**
     * Sums the squared weights of all term weights and scales the result by the squared query boost.
     *
     * @return float 1.0 is returned instead of a zero sum (e.g. for a query like '-something -another')
     */
    public function sumOfSquaredWeights()
    {
        $total = 0;
        foreach ($this->_weights as $termWeight) {
            $total += $termWeight->sumOfSquaredWeights();
        }

        // apply the query boost to each of the squared sub-weights
        $squaredBoost = $this->_query->getBoost() * $this->_query->getBoost();
        $total = $total * $squaredBoost;

        // empty query protection
        return ($total == 0) ? 1.0 : $total;
    }

    /**
     * Passes the normalization factor, multiplied by the query boost, to every term weight.
     *
     * @param float $queryNorm
     */
    public function normalize($queryNorm)
    {
        $boostedNorm = $queryNorm * $this->_query->getBoost();

        foreach ($this->_weights as $termWeight) {
            $termWeight->normalize($boostedNorm);
        }
    }
}
Weight/Boolean.php 0000666 00000007017 15125175533 0010103 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Weight */
require_once 'Zend/Search/Lucene/Search/Weight.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Weight_Boolean extends Zend_Search_Lucene_Search_Weight
{
    /**
     * Index reader handed over to every subquery when its weight is created.
     *
     * @var Zend_Search_Lucene_Interface
     */
    private $_reader;

    /**
     * Query this weight belongs to.
     *
     * @var Zend_Search_Lucene_Search_Query
     */
    private $_query;

    /**
     * Weights of the non-prohibited subqueries, indexed by subquery number.
     * Array of Zend_Search_Lucene_Search_Weight
     *
     * @var array
     */
    private $_weights;

    /**
     * Asks every subquery which is not prohibited to create its weight.
     *
     * A subquery is skipped only when the query has signs and the sign of the subquery
     * is non-null and evaluates to false.
     *
     * @param Zend_Search_Lucene_Search_Query $query
     * @param Zend_Search_Lucene_Interface    $reader
     */
    public function __construct(Zend_Search_Lucene_Search_Query $query,
                                Zend_Search_Lucene_Interface    $reader)
    {
        $this->_query   = $query;
        $this->_reader  = $reader;
        $this->_weights = array();

        $signs = $query->getSigns();

        foreach ($query->getSubqueries() as $num => $subquery) {
            // prohibited subquery: contributes nothing to the weight
            if ($signs !== null && $signs[$num] !== null && !$signs[$num]) {
                continue;
            }

            $this->_weights[$num] = $subquery->createWeight($reader);
        }
    }

    /**
     * Weight value of the query.
     * The boost of the query is returned; the stored Weight value is not consulted here.
     *
     * @return float
     */
    public function getValue()
    {
        $boost = $this->_query->getBoost();

        return $boost;
    }

    /**
     * Sums the squared weights of all subquery weights and scales the result by the squared query boost.
     *
     * @return float 1.0 is returned instead of a zero sum (e.g. for a query like '-something -another')
     */
    public function sumOfSquaredWeights()
    {
        $total = 0;
        foreach ($this->_weights as $subWeight) {
            $total += $subWeight->sumOfSquaredWeights();
        }

        // apply the query boost to each of the squared sub-weights
        $squaredBoost = $this->_query->getBoost() * $this->_query->getBoost();
        $total = $total * $squaredBoost;

        // empty query protection
        return ($total == 0) ? 1.0 : $total;
    }

    /**
     * Passes the normalization factor, multiplied by the query boost, to every subquery weight.
     *
     * @param float $queryNorm
     */
    public function normalize($queryNorm)
    {
        $boostedNorm = $queryNorm * $this->_query->getBoost();

        foreach ($this->_weights as $subWeight) {
            $subWeight->normalize($boostedNorm);
        }
    }
}
Weight/Phrase.php 0000666 00000005457 15125175533 0007754 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* Zend_Search_Lucene_Search_Weight
*/
require_once 'Zend/Search/Lucene/Search/Weight.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Weight_Phrase extends Zend_Search_Lucene_Search_Weight
{
    /**
     * Index reader used to obtain the similarity object and passed to its idf() call.
     *
     * @var Zend_Search_Lucene_Interface
     */
    private $_reader;

    /**
     * The query that this concerns.
     *
     * @var Zend_Search_Lucene_Search_Query_Phrase
     */
    private $_query;

    /**
     * Score factor, set by sumOfSquaredWeights().
     *
     * @var float
     */
    private $_idf;

    /**
     * Query weight (idf * boost), later multiplied by the query norm in normalize().
     *
     * Declared explicitly (as Zend_Search_Lucene_Search_Weight_Term does) so that it is
     * not created as a dynamic property on first assignment.
     *
     * @var float
     */
    private $_queryWeight;

    /**
     * Zend_Search_Lucene_Search_Weight_Phrase constructor
     *
     * Only stores the query and the reader; nothing is calculated here.
     *
     * @param Zend_Search_Lucene_Search_Query_Phrase $query
     * @param Zend_Search_Lucene_Interface           $reader
     */
    public function __construct(Zend_Search_Lucene_Search_Query_Phrase $query,
                                Zend_Search_Lucene_Interface           $reader)
    {
        $this->_query  = $query;
        $this->_reader = $reader;
    }

    /**
     * The sum of squared weights of contained query clauses.
     *
     * Calculates idf for the terms of the phrase and the (not yet normalized) query weight.
     * Must be invoked before normalize(), which relies on the values stored here.
     *
     * @return float
     */
    public function sumOfSquaredWeights()
    {
        // compute idf
        $this->_idf = $this->_reader->getSimilarity()->idf($this->_query->getTerms(), $this->_reader);

        // compute query weight
        $this->_queryWeight = $this->_idf * $this->_query->getBoost();

        // square it
        return $this->_queryWeight * $this->_queryWeight;
    }

    /**
     * Assigns the query normalization factor to this.
     *
     * NOTE(review): $_queryNorm and $_value are not declared in this class; presumably they
     * are inherited from Zend_Search_Lucene_Search_Weight - confirm against the parent.
     *
     * @param float $queryNorm
     */
    public function normalize($queryNorm)
    {
        $this->_queryNorm = $queryNorm;

        // normalize query weight
        $this->_queryWeight *= $queryNorm;

        // idf for documents
        $this->_value = $this->_queryWeight * $this->_idf;
    }
}
QueryEntry.php 0000666 00000004160 15125175533 0007420 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Index_Term */
require_once 'Zend/Search/Lucene/Index/Term.php';
/** Zend_Search_Lucene_Search_QueryEntry_Term */
require_once 'Zend/Search/Lucene/Search/QueryEntry/Term.php';
/** Zend_Search_Lucene_Search_QueryEntry_Phrase */
require_once 'Zend/Search/Lucene/Search/QueryEntry/Phrase.php';
/** Zend_Search_Lucene_Search_QueryEntry_Subquery */
require_once 'Zend/Search/Lucene/Search/QueryEntry/Subquery.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Search_QueryEntry
{
    /**
     * Accumulated boost factor of the entry; starts at 1.0.
     *
     * @var float
     */
    protected $_boost = 1.0;

    /**
     * Handle the '~' modifier applied to this entry.
     *
     * @param mixed $parameter
     */
    abstract public function processFuzzyProximityModifier($parameter = null);

    /**
     * Convert the entry into a query object.
     *
     * @param string $encoding
     * @return Zend_Search_Lucene_Search_Query
     */
    abstract public function getQuery($encoding);

    /**
     * Multiply the current boost of the entry by the given factor.
     * Repeated calls accumulate.
     *
     * @param float $boostFactor
     */
    public function boost($boostFactor)
    {
        $this->_boost = $this->_boost * $boostFactor;
    }
}
QueryToken.php 0000666 00000016712 15125175533 0007405 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_QueryToken
{
    /**
     * Token types.
     */
    const TT_WORD                 = 0;  // Word
    const TT_PHRASE               = 1;  // Phrase (one or several quoted words)
    const TT_FIELD                = 2;  // Field name in 'field:word', field:<phrase> or field:(<subquery>) pairs
    const TT_FIELD_INDICATOR      = 3;  // ':'
    const TT_REQUIRED             = 4;  // '+'
    const TT_PROHIBITED           = 5;  // '-'
    const TT_FUZZY_PROX_MARK      = 6;  // '~'
    const TT_BOOSTING_MARK        = 7;  // '^'
    const TT_RANGE_INCL_START     = 8;  // '['
    const TT_RANGE_INCL_END       = 9;  // ']'
    const TT_RANGE_EXCL_START     = 10; // '{'
    const TT_RANGE_EXCL_END       = 11; // '}'
    const TT_SUBQUERY_START       = 12; // '('
    const TT_SUBQUERY_END         = 13; // ')'
    const TT_AND_LEXEME           = 14; // 'AND' or 'and'
    const TT_OR_LEXEME            = 15; // 'OR'  or 'or'
    const TT_NOT_LEXEME           = 16; // 'NOT' or 'not'
    const TT_TO_LEXEME            = 17; // 'TO'  or 'to'
    const TT_NUMBER               = 18; // Number, like: 10, 0.8, .64, ....

    /**
     * Returns all possible lexeme types.
     * It's used for syntax analyzer state machine initialization
     *
     * @return array
     */
    public static function getTypes()
    {
        return array( self::TT_WORD,
                      self::TT_PHRASE,
                      self::TT_FIELD,
                      self::TT_FIELD_INDICATOR,
                      self::TT_REQUIRED,
                      self::TT_PROHIBITED,
                      self::TT_FUZZY_PROX_MARK,
                      self::TT_BOOSTING_MARK,
                      self::TT_RANGE_INCL_START,
                      self::TT_RANGE_INCL_END,
                      self::TT_RANGE_EXCL_START,
                      self::TT_RANGE_EXCL_END,
                      self::TT_SUBQUERY_START,
                      self::TT_SUBQUERY_END,
                      self::TT_AND_LEXEME,
                      self::TT_OR_LEXEME,
                      self::TT_NOT_LEXEME,
                      self::TT_TO_LEXEME,
                      self::TT_NUMBER
                   );
    }

    /**
     * TokenCategories
     */
    const TC_WORD           = 0;  // Word
    const TC_PHRASE         = 1;  // Phrase (one or several quoted words)
    const TC_NUMBER         = 2;  // Numbers, which are used with syntax elements. Ex. roam~0.8
    const TC_SYNTAX_ELEMENT = 3;  // + - ( ) [ ] { } ! || && ~ ^

    /**
     * Token type (one of the TT_* constants).
     *
     * @var integer
     */
    public $type;

    /**
     * Token text.
     *
     * @var string
     */
    public $text;

    /**
     * Token position within query (1-based).
     *
     * @var integer
     */
    public $position;

    /**
     * Creates a token and derives its type from the token category and the token text.
     *
     * Words 'and', 'or', 'not' and 'to' (in any letter case) are turned into the
     * corresponding lexeme types; any other word is a TT_WORD.
     *
     * @param integer $tokenCategory  one of the TC_* constants
     * @param string  $tokenText
     * @param integer $position       0-based position; stored incremented by one
     * @throws Zend_Search_Lucene_Exception on an unknown category or syntax element
     */
    public function __construct($tokenCategory, $tokenText, $position)
    {
        $this->text     = $tokenText;
        $this->position = $position + 1; // Start from 1

        switch ($tokenCategory) {
            case self::TC_WORD:
                // operators are recognized case-insensitively; lowercase the text only once
                switch (strtolower($tokenText)) {
                    case 'and':
                        $this->type = self::TT_AND_LEXEME;
                        break;
                    case 'or':
                        $this->type = self::TT_OR_LEXEME;
                        break;
                    case 'not':
                        $this->type = self::TT_NOT_LEXEME;
                        break;
                    case 'to':
                        $this->type = self::TT_TO_LEXEME;
                        break;
                    default:
                        $this->type = self::TT_WORD;
                }
                break;

            case self::TC_PHRASE:
                $this->type = self::TT_PHRASE;
                break;

            case self::TC_NUMBER:
                $this->type = self::TT_NUMBER;
                break;

            case self::TC_SYNTAX_ELEMENT:
                switch ($tokenText) {
                    case ':':
                        $this->type = self::TT_FIELD_INDICATOR;
                        break;
                    case '+':
                        $this->type = self::TT_REQUIRED;
                        break;
                    case '-':
                        $this->type = self::TT_PROHIBITED;
                        break;
                    case '~':
                        $this->type = self::TT_FUZZY_PROX_MARK;
                        break;
                    case '^':
                        $this->type = self::TT_BOOSTING_MARK;
                        break;
                    case '[':
                        $this->type = self::TT_RANGE_INCL_START;
                        break;
                    case ']':
                        $this->type = self::TT_RANGE_INCL_END;
                        break;
                    case '{':
                        $this->type = self::TT_RANGE_EXCL_START;
                        break;
                    case '}':
                        $this->type = self::TT_RANGE_EXCL_END;
                        break;
                    case '(':
                        $this->type = self::TT_SUBQUERY_START;
                        break;
                    case ')':
                        $this->type = self::TT_SUBQUERY_END;
                        break;
                    case '!':
                        $this->type = self::TT_NOT_LEXEME;
                        break;
                    case '&&':
                        $this->type = self::TT_AND_LEXEME;
                        break;
                    case '||':
                        $this->type = self::TT_OR_LEXEME;
                        break;
                    default:
                        require_once 'Zend/Search/Lucene/Exception.php';
                        throw new Zend_Search_Lucene_Exception('Unrecognized query syntax lexeme: \'' . $tokenText . '\'');
                }
                break;

            // A second, unreachable 'case self::TC_NUMBER' used to sit here without a 'break';
            // it was removed because the first TC_NUMBER case above always wins.
            default:
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Unrecognized lexeme type: \'' . $tokenCategory . '\'');
        }
    }
}
Similarity/Default.php 0000666 00000005312 15125175533 0011003 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Similarity */
require_once 'Zend/Search/Lucene/Search/Similarity.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Similarity_Default extends Zend_Search_Lucene_Search_Similarity
{
    /**
     * Length normalization: '1/sqrt(numTerms)'.
     * 1E10 is returned when the number of terms is zero.
     *
     * @param string  $fieldName  not used by this implementation
     * @param integer $numTerms
     * @return float
     */
    public function lengthNorm($fieldName, $numTerms)
    {
        return ($numTerms == 0) ? 1E10 : 1.0/sqrt($numTerms);
    }

    /**
     * Query normalization: '1/sqrt(sumOfSquaredWeights)'.
     *
     * @param float $sumOfSquaredWeights
     * @return float
     */
    public function queryNorm($sumOfSquaredWeights)
    {
        $root = sqrt($sumOfSquaredWeights);

        return 1.0/$root;
    }

    /**
     * Term frequency factor: 'sqrt(freq)'.
     *
     * @param float $freq
     * @return float
     */
    public function tf($freq)
    {
        $factor = sqrt($freq);

        return $factor;
    }

    /**
     * Sloppy phrase match factor: '1/(distance + 1)'.
     *
     * @param integer $distance
     * @return float
     */
    public function sloppyFreq($distance)
    {
        $divisor = $distance + 1;

        return 1.0/$divisor;
    }

    /**
     * Inverse document frequency factor: 'log(numDocs/(docFreq+1)) + 1'.
     *
     * @param integer $docFreq
     * @param integer $numDocs
     * @return float
     */
    public function idfFreq($docFreq, $numDocs)
    {
        $ratio = $numDocs/(float)($docFreq + 1);

        return log($ratio) + 1.0;
    }

    /**
     * Coordination factor: 'overlap/maxOverlap'.
     *
     * @param integer $overlap
     * @param integer $maxOverlap
     * @return float
     */
    public function coord($overlap, $maxOverlap)
    {
        $divisor = (float)$maxOverlap;

        return $overlap/$divisor;
    }
}
QueryParserContext.php 0000666 00000033033 15125175533 0011121 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_FSM */
require_once 'Zend/Search/Lucene/FSM.php';
/** Zend_Search_Lucene_Index_Term */
require_once 'Zend/Search/Lucene/Index/Term.php';
/** Zend_Search_Lucene_Search_QueryToken */
require_once 'Zend/Search/Lucene/Search/QueryToken.php';
/** Zend_Search_Lucene_Search_Query_Term */
require_once 'Zend/Search/Lucene/Search/Query/Term.php';
/** Zend_Search_Lucene_Search_Query_MultiTerm */
require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
/** Zend_Search_Lucene_Search_Query_Boolean */
require_once 'Zend/Search/Lucene/Search/Query/Boolean.php';
/** Zend_Search_Lucene_Search_Query_Phrase */
require_once 'Zend/Search/Lucene/Search/Query/Phrase.php';
/** Zend_Search_Lucene_Search_BooleanExpressionRecognizer */
require_once 'Zend/Search/Lucene/Search/BooleanExpressionRecognizer.php';
/** Zend_Search_Lucene_Search_QueryEntry */
require_once 'Zend/Search/Lucene/Search/QueryEntry.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_QueryParserContext
{
/**
* Default field for the context.
*
* null means, that term should be searched through all fields
* Zend_Search_Lucene_Search_Query::rewriteQuery($index) transletes such queries to several
*
* @var string|null
*/
private $_defaultField;
/**
* Field specified for next entry
*
* @var string
*/
private $_nextEntryField = null;
/**
* True means, that term is required.
* False means, that term is prohibited.
* null means, that term is neither prohibited, nor required
*
* @var boolean
*/
private $_nextEntrySign = null;
/**
* Entries grouping mode
*/
const GM_SIGNS = 0; // Signs mode: '+term1 term2 -term3 +(subquery1) -(subquery2)'
const GM_BOOLEAN = 1; // Boolean operators mode: 'term1 and term2 or (subquery1) and not (subquery2)'
/**
* Grouping mode
*
* @var integer
*/
private $_mode = null;
/**
* Entries signs.
* Used in GM_SIGNS grouping mode
*
* @var arrays
*/
private $_signs = array();
/**
* Query entries
* Each entry is a Zend_Search_Lucene_Search_QueryEntry object or
* boolean operator (Zend_Search_Lucene_Search_QueryToken class constant)
*
* @var array
*/
private $_entries = array();
/**
* Query string encoding
*
* @var string
*/
private $_encoding;
/**
* Context object constructor
*
* @param string $encoding
* @param string|null $defaultField
*/
public function __construct($encoding, $defaultField = null)
{
$this->_encoding = $encoding;
$this->_defaultField = $defaultField;
}
/**
* Get context default field
*
* @return string|null
*/
public function getField()
{
return ($this->_nextEntryField !== null) ? $this->_nextEntryField : $this->_defaultField;
}
/**
* Set field for next entry
*
* @param string $field
*/
public function setNextEntryField($field)
{
$this->_nextEntryField = $field;
}
/**
* Set sign for next entry
*
* @param integer $sign
* @throws Zend_Search_Lucene_Exception
*/
public function setNextEntrySign($sign)
{
if ($this->_mode === self::GM_BOOLEAN) {
require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
throw new Zend_Search_Lucene_Search_QueryParserException('It\'s not allowed to mix boolean and signs styles in the same subquery.');
}
$this->_mode = self::GM_SIGNS;
if ($sign == Zend_Search_Lucene_Search_QueryToken::TT_REQUIRED) {
$this->_nextEntrySign = true;
} else if ($sign == Zend_Search_Lucene_Search_QueryToken::TT_PROHIBITED) {
$this->_nextEntrySign = false;
} else {
require_once 'Zend/Search/Lucene/Exception.php';
throw new Zend_Search_Lucene_Exception('Unrecognized sign type.');
}
}
/**
* Add entry to a query
*
* @param Zend_Search_Lucene_Search_QueryEntry $entry
*/
public function addEntry(Zend_Search_Lucene_Search_QueryEntry $entry)
{
if ($this->_mode !== self::GM_BOOLEAN) {
$this->_signs[] = $this->_nextEntrySign;
}
$this->_entries[] = $entry;
$this->_nextEntryField = null;
$this->_nextEntrySign = null;
}
/**
* Process fuzzy search or proximity search modifier
*
* @throws Zend_Search_Lucene_Search_QueryParserException
*/
public function processFuzzyProximityModifier($parameter = null)
{
// Check, that modifier has came just after word or phrase
if ($this->_nextEntryField !== null || $this->_nextEntrySign !== null) {
require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
throw new Zend_Search_Lucene_Search_QueryParserException('\'~\' modifier must follow word or phrase.');
}
$lastEntry = array_pop($this->_entries);
if (!$lastEntry instanceof Zend_Search_Lucene_Search_QueryEntry) {
// there are no entries or last entry is boolean operator
require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
throw new Zend_Search_Lucene_Search_QueryParserException('\'~\' modifier must follow word or phrase.');
}
$lastEntry->processFuzzyProximityModifier($parameter);
$this->_entries[] = $lastEntry;
}
/**
* Set boost factor to the entry
*
* @param float $boostFactor
*/
public function boost($boostFactor)
{
// Check, that modifier has came just after word or phrase
if ($this->_nextEntryField !== null || $this->_nextEntrySign !== null) {
require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
throw new Zend_Search_Lucene_Search_QueryParserException('\'^\' modifier must follow word, phrase or subquery.');
}
$lastEntry = array_pop($this->_entries);
if (!$lastEntry instanceof Zend_Search_Lucene_Search_QueryEntry) {
// there are no entries or last entry is boolean operator
require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
throw new Zend_Search_Lucene_Search_QueryParserException('\'^\' modifier must follow word, phrase or subquery.');
}
$lastEntry->boost($boostFactor);
$this->_entries[] = $lastEntry;
}
/**
* Process logical operator
*
* @param integer $operator
*/
public function addLogicalOperator($operator)
{
if ($this->_mode === self::GM_SIGNS) {
require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
throw new Zend_Search_Lucene_Search_QueryParserException('It\'s not allowed to mix boolean and signs styles in the same subquery.');
}
$this->_mode = self::GM_BOOLEAN;
$this->_entries[] = $operator;
}
/**
* Generate 'signs style' query from the context
* '+term1 term2 -term3 +(<subquery1>) ...'
*
* @return Zend_Search_Lucene_Search_Query
*/
public function _signStyleExpressionQuery()
{
$query = new Zend_Search_Lucene_Search_Query_Boolean();
if (Zend_Search_Lucene_Search_QueryParser::getDefaultOperator() == Zend_Search_Lucene_Search_QueryParser::B_AND) {
$defaultSign = true; // required
} else {
// Zend_Search_Lucene_Search_QueryParser::B_OR
$defaultSign = null; // optional
}
foreach ($this->_entries as $entryId => $entry) {
$sign = ($this->_signs[$entryId] !== null) ? $this->_signs[$entryId] : $defaultSign;
$query->addSubquery($entry->getQuery($this->_encoding), $sign);
}
return $query;
}
/**
* Generate 'boolean style' query from the context
* 'term1 and term2 or term3 and (<subquery1>) and not (<subquery2>)'
*
* @return Zend_Search_Lucene_Search_Query
* @throws Zend_Search_Lucene
*/
private function _booleanExpressionQuery()
{
/**
* We treat each level of an expression as a boolean expression in
* a Disjunctive Normal Form
*
* AND operator has higher precedence than OR
*
* Thus logical query is a disjunction of one or more conjunctions of
* one or more query entries
*/
$expressionRecognizer = new Zend_Search_Lucene_Search_BooleanExpressionRecognizer();
require_once 'Zend/Search/Lucene/Exception.php';
try {
foreach ($this->_entries as $entry) {
if ($entry instanceof Zend_Search_Lucene_Search_QueryEntry) {
$expressionRecognizer->processLiteral($entry);
} else {
switch ($entry) {
case Zend_Search_Lucene_Search_QueryToken::TT_AND_LEXEME:
$expressionRecognizer->processOperator(Zend_Search_Lucene_Search_BooleanExpressionRecognizer::IN_AND_OPERATOR);
break;
case Zend_Search_Lucene_Search_QueryToken::TT_OR_LEXEME:
$expressionRecognizer->processOperator(Zend_Search_Lucene_Search_BooleanExpressionRecognizer::IN_OR_OPERATOR);
break;
case Zend_Search_Lucene_Search_QueryToken::TT_NOT_LEXEME:
$expressionRecognizer->processOperator(Zend_Search_Lucene_Search_BooleanExpressionRecognizer::IN_NOT_OPERATOR);
break;
default:
throw new Zend_Search_Lucene('Boolean expression error. Unknown operator type.');
}
}
}
$conjuctions = $expressionRecognizer->finishExpression();
} catch (Zend_Search_Exception $e) {
// throw new Zend_Search_Lucene_Search_QueryParserException('Boolean expression error. Error message: \'' .
// $e->getMessage() . '\'.' );
// It's query syntax error message and it should be user friendly. So FSM message is omitted
require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
throw new Zend_Search_Lucene_Search_QueryParserException('Boolean expression error.');
}
// Remove 'only negative' conjunctions
foreach ($conjuctions as $conjuctionId => $conjuction) {
$nonNegativeEntryFound = false;
foreach ($conjuction as $conjuctionEntry) {
if ($conjuctionEntry[1]) {
$nonNegativeEntryFound = true;
break;
}
}
if (!$nonNegativeEntryFound) {
unset($conjuctions[$conjuctionId]);
}
}
$subqueries = array();
foreach ($conjuctions as $conjuction) {
// Check, if it's a one term conjuction
if (count($conjuction) == 1) {
$subqueries[] = $conjuction[0][0]->getQuery($this->_encoding);
} else {
$subquery = new Zend_Search_Lucene_Search_Query_Boolean();
foreach ($conjuction as $conjuctionEntry) {
$subquery->addSubquery($conjuctionEntry[0]->getQuery($this->_encoding), $conjuctionEntry[1]);
}
$subqueries[] = $subquery;
}
}
if (count($subqueries) == 0) {
return new Zend_Search_Lucene_Search_Query_Insignificant();
}
if (count($subqueries) == 1) {
return $subqueries[0];
}
$query = new Zend_Search_Lucene_Search_Query_Boolean();
foreach ($subqueries as $subquery) {
// Non-required entry/subquery
$query->addSubquery($subquery);
}
return $query;
}
/**
* Generate query from current context
*
* @return Zend_Search_Lucene_Search_Query
*/
public function getQuery()
{
    // Boolean-operator mode has its own query builder; every other mode
    // is handled by the sign-style builder.
    $useBooleanBuilder = ($this->_mode === self::GM_BOOLEAN);

    return $useBooleanBuilder
        ? $this->_booleanExpressionQuery()
        : $this->_signStyleExpressionQuery();
}
}
QueryHit.php 0000666 00000005303 15125175533 0007043 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_QueryHit
{
    /**
     * Index this hit was produced by (stored wrapped in a proxy object)
     * @var Zend_Search_Lucene_Interface
     */
    protected $_index = null;

    /**
     * Document belonging to this hit; stays null until it is first requested
     * @var Zend_Search_Lucene_Document
     */
    protected $_document = null;

    /**
     * Document number within the index
     * @var integer
     */
    public $id;

    /**
     * Hit score
     * @var float
     */
    public $score;

    /**
     * Constructor - remembers the index that produced the hit, so the
     * matching document can later be fetched through the hit itself.
     *
     * @param Zend_Search_Lucene_Interface $index
     */
    public function __construct(Zend_Search_Lucene_Interface $index)
    {
        $indexProxy   = new Zend_Search_Lucene_Proxy($index);
        $this->_index = $indexProxy;
    }

    /**
     * Shortcut for reading a field of the hit's document as a property
     * of the hit. The document is loaded on demand.
     *
     * @param string $offset  field name
     * @return string
     */
    public function __get($offset)
    {
        $document = $this->getDocument();

        return $document->getFieldValue($offset);
    }

    /**
     * Return the document object for this hit.
     *
     * The document is fetched from the index by $this->id on the first call
     * and the same object is returned on later calls.
     *
     * @return Zend_Search_Lucene_Document
     */
    public function getDocument()
    {
        // Already loaded - reuse it
        if ($this->_document instanceof Zend_Search_Lucene_Document) {
            return $this->_document;
        }

        $this->_document = $this->_index->getDocument($this->id);

        return $this->_document;
    }

    /**
     * Return the index object for this hit
     *
     * @return Zend_Search_Lucene_Interface
     */
    public function getIndex()
    {
        return $this->_index;
    }
}
QueryLexer.php 0000666 00000062566 15125175533 0007414 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_FSM */
require_once 'Zend/Search/Lucene/FSM.php';
/** Zend_Search_Lucene_Search_QueryToken */
require_once 'Zend/Search/Lucene/Search/QueryToken.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_QueryLexer extends Zend_Search_Lucene_FSM
{
/** State Machine states */
const ST_WHITE_SPACE = 0;
const ST_SYNT_LEXEME = 1;
const ST_LEXEME = 2;
const ST_QUOTED_LEXEME = 3;
const ST_ESCAPED_CHAR = 4;
const ST_ESCAPED_QCHAR = 5;
const ST_LEXEME_MODIFIER = 6;
const ST_NUMBER = 7;
const ST_MANTISSA = 8;
const ST_ERROR = 9;
/** Input symbols */
const IN_WHITE_SPACE = 0;
const IN_SYNT_CHAR = 1;
const IN_LEXEME_MODIFIER = 2;
const IN_ESCAPE_CHAR = 3;
const IN_QUOTE = 4;
const IN_DECIMAL_POINT = 5;
const IN_ASCII_DIGIT = 6;
const IN_CHAR = 7;
const IN_MUTABLE_CHAR = 8;
const QUERY_WHITE_SPACE_CHARS = " \n\r\t";
const QUERY_SYNT_CHARS = ':()[]{}!|&';
const QUERY_MUTABLE_CHARS = '+-';
const QUERY_DOUBLECHARLEXEME_CHARS = '|&';
const QUERY_LEXEMEMODIFIER_CHARS = '~^';
const QUERY_ASCIIDIGITS_CHARS = '0123456789';
/**
* List of recognized lexemes
*
* @var array
*/
private $_lexemes;
/**
* Query string (array of single- or non single-byte characters)
*
* @var array
*/
private $_queryString;
/**
* Current position within a query string
* Used to create appropriate error messages
*
* @var integer
*/
private $_queryStringPosition;
/**
* Recognized part of current lexeme
*
* @var string
*/
private $_currentLexeme;
/**
* Lexer constructor
*
* Registers the lexer states and input symbols with the parent state machine,
* adds the transition rules for every state and attaches the actions which
* collect lexemes or raise query syntax errors.
*/
public function __construct()
{
// States and input symbols known to the state machine
parent::__construct( array(self::ST_WHITE_SPACE,
self::ST_SYNT_LEXEME,
self::ST_LEXEME,
self::ST_QUOTED_LEXEME,
self::ST_ESCAPED_CHAR,
self::ST_ESCAPED_QCHAR,
self::ST_LEXEME_MODIFIER,
self::ST_NUMBER,
self::ST_MANTISSA,
self::ST_ERROR),
array(self::IN_WHITE_SPACE,
self::IN_SYNT_CHAR,
self::IN_MUTABLE_CHAR,
self::IN_LEXEME_MODIFIER,
self::IN_ESCAPE_CHAR,
self::IN_QUOTE,
self::IN_DECIMAL_POINT,
self::IN_ASCII_DIGIT,
self::IN_CHAR));
// Error actions: each one wraps a method of this lexer which throws a query parser exception.
// They are attached below to the rules that lead to ST_ERROR.
$lexemeModifierErrorAction = new Zend_Search_Lucene_FSMAction($this, 'lexModifierErrException');
$quoteWithinLexemeErrorAction = new Zend_Search_Lucene_FSMAction($this, 'quoteWithinLexemeErrException');
$wrongNumberErrorAction = new Zend_Search_Lucene_FSMAction($this, 'wrongNumberErrException');
// Rules are listed as array(current state, input symbol, next state[, action]),
// as the ST_/IN_ constant prefixes indicate.
// Transitions from ST_WHITE_SPACE
$this->addRules(array( array(self::ST_WHITE_SPACE, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE),
array(self::ST_WHITE_SPACE, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME),
array(self::ST_WHITE_SPACE, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME),
array(self::ST_WHITE_SPACE, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
array(self::ST_WHITE_SPACE, self::IN_ESCAPE_CHAR, self::ST_ESCAPED_CHAR),
array(self::ST_WHITE_SPACE, self::IN_QUOTE, self::ST_QUOTED_LEXEME),
array(self::ST_WHITE_SPACE, self::IN_DECIMAL_POINT, self::ST_LEXEME),
array(self::ST_WHITE_SPACE, self::IN_ASCII_DIGIT, self::ST_LEXEME),
array(self::ST_WHITE_SPACE, self::IN_CHAR, self::ST_LEXEME)
));
// Transitions from ST_SYNT_LEXEME (same targets as from ST_WHITE_SPACE)
$this->addRules(array( array(self::ST_SYNT_LEXEME, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE),
array(self::ST_SYNT_LEXEME, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME),
array(self::ST_SYNT_LEXEME, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME),
array(self::ST_SYNT_LEXEME, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
array(self::ST_SYNT_LEXEME, self::IN_ESCAPE_CHAR, self::ST_ESCAPED_CHAR),
array(self::ST_SYNT_LEXEME, self::IN_QUOTE, self::ST_QUOTED_LEXEME),
array(self::ST_SYNT_LEXEME, self::IN_DECIMAL_POINT, self::ST_LEXEME),
array(self::ST_SYNT_LEXEME, self::IN_ASCII_DIGIT, self::ST_LEXEME),
array(self::ST_SYNT_LEXEME, self::IN_CHAR, self::ST_LEXEME)
));
// Transitions from ST_LEXEME: mutable chars ('+', '-'), '.' and digits stay inside the lexeme
$this->addRules(array( array(self::ST_LEXEME, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE),
array(self::ST_LEXEME, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME),
array(self::ST_LEXEME, self::IN_MUTABLE_CHAR, self::ST_LEXEME),
array(self::ST_LEXEME, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
array(self::ST_LEXEME, self::IN_ESCAPE_CHAR, self::ST_ESCAPED_CHAR),
// IN_QUOTE not allowed
array(self::ST_LEXEME, self::IN_QUOTE, self::ST_ERROR, $quoteWithinLexemeErrorAction),
array(self::ST_LEXEME, self::IN_DECIMAL_POINT, self::ST_LEXEME),
array(self::ST_LEXEME, self::IN_ASCII_DIGIT, self::ST_LEXEME),
array(self::ST_LEXEME, self::IN_CHAR, self::ST_LEXEME)
));
// Transitions from ST_QUOTED_LEXEME: everything except the escape char and the closing quote stays inside
$this->addRules(array( array(self::ST_QUOTED_LEXEME, self::IN_WHITE_SPACE, self::ST_QUOTED_LEXEME),
array(self::ST_QUOTED_LEXEME, self::IN_SYNT_CHAR, self::ST_QUOTED_LEXEME),
array(self::ST_QUOTED_LEXEME, self::IN_MUTABLE_CHAR, self::ST_QUOTED_LEXEME),
array(self::ST_QUOTED_LEXEME, self::IN_LEXEME_MODIFIER, self::ST_QUOTED_LEXEME),
array(self::ST_QUOTED_LEXEME, self::IN_ESCAPE_CHAR, self::ST_ESCAPED_QCHAR),
array(self::ST_QUOTED_LEXEME, self::IN_QUOTE, self::ST_WHITE_SPACE),
array(self::ST_QUOTED_LEXEME, self::IN_DECIMAL_POINT, self::ST_QUOTED_LEXEME),
array(self::ST_QUOTED_LEXEME, self::IN_ASCII_DIGIT, self::ST_QUOTED_LEXEME),
array(self::ST_QUOTED_LEXEME, self::IN_CHAR, self::ST_QUOTED_LEXEME)
));
// Transitions from ST_ESCAPED_CHAR: any input symbol continues as part of a lexeme
$this->addRules(array( array(self::ST_ESCAPED_CHAR, self::IN_WHITE_SPACE, self::ST_LEXEME),
array(self::ST_ESCAPED_CHAR, self::IN_SYNT_CHAR, self::ST_LEXEME),
array(self::ST_ESCAPED_CHAR, self::IN_MUTABLE_CHAR, self::ST_LEXEME),
array(self::ST_ESCAPED_CHAR, self::IN_LEXEME_MODIFIER, self::ST_LEXEME),
array(self::ST_ESCAPED_CHAR, self::IN_ESCAPE_CHAR, self::ST_LEXEME),
array(self::ST_ESCAPED_CHAR, self::IN_QUOTE, self::ST_LEXEME),
array(self::ST_ESCAPED_CHAR, self::IN_DECIMAL_POINT, self::ST_LEXEME),
array(self::ST_ESCAPED_CHAR, self::IN_ASCII_DIGIT, self::ST_LEXEME),
array(self::ST_ESCAPED_CHAR, self::IN_CHAR, self::ST_LEXEME)
));
// Transitions from ST_ESCAPED_QCHAR: any input symbol returns to the quoted lexeme
$this->addRules(array( array(self::ST_ESCAPED_QCHAR, self::IN_WHITE_SPACE, self::ST_QUOTED_LEXEME),
array(self::ST_ESCAPED_QCHAR, self::IN_SYNT_CHAR, self::ST_QUOTED_LEXEME),
array(self::ST_ESCAPED_QCHAR, self::IN_MUTABLE_CHAR, self::ST_QUOTED_LEXEME),
array(self::ST_ESCAPED_QCHAR, self::IN_LEXEME_MODIFIER, self::ST_QUOTED_LEXEME),
array(self::ST_ESCAPED_QCHAR, self::IN_ESCAPE_CHAR, self::ST_QUOTED_LEXEME),
array(self::ST_ESCAPED_QCHAR, self::IN_QUOTE, self::ST_QUOTED_LEXEME),
array(self::ST_ESCAPED_QCHAR, self::IN_DECIMAL_POINT, self::ST_QUOTED_LEXEME),
array(self::ST_ESCAPED_QCHAR, self::IN_ASCII_DIGIT, self::ST_QUOTED_LEXEME),
array(self::ST_ESCAPED_QCHAR, self::IN_CHAR, self::ST_QUOTED_LEXEME)
));
// Transitions from ST_LEXEME_MODIFIER: a modifier may be followed by a number ('.' or a digit starts it)
$this->addRules(array( array(self::ST_LEXEME_MODIFIER, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE),
array(self::ST_LEXEME_MODIFIER, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME),
array(self::ST_LEXEME_MODIFIER, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME),
array(self::ST_LEXEME_MODIFIER, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
// IN_ESCAPE_CHAR not allowed
array(self::ST_LEXEME_MODIFIER, self::IN_ESCAPE_CHAR, self::ST_ERROR, $lexemeModifierErrorAction),
// IN_QUOTE not allowed
array(self::ST_LEXEME_MODIFIER, self::IN_QUOTE, self::ST_ERROR, $lexemeModifierErrorAction),
array(self::ST_LEXEME_MODIFIER, self::IN_DECIMAL_POINT, self::ST_MANTISSA),
array(self::ST_LEXEME_MODIFIER, self::IN_ASCII_DIGIT, self::ST_NUMBER),
// IN_CHAR not allowed
array(self::ST_LEXEME_MODIFIER, self::IN_CHAR, self::ST_ERROR, $lexemeModifierErrorAction),
));
// Transitions from ST_NUMBER (digits before the decimal point)
$this->addRules(array( array(self::ST_NUMBER, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE),
array(self::ST_NUMBER, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME),
array(self::ST_NUMBER, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME),
array(self::ST_NUMBER, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
// IN_ESCAPE_CHAR not allowed
array(self::ST_NUMBER, self::IN_ESCAPE_CHAR, self::ST_ERROR, $wrongNumberErrorAction),
// IN_QUOTE not allowed
array(self::ST_NUMBER, self::IN_QUOTE, self::ST_ERROR, $wrongNumberErrorAction),
array(self::ST_NUMBER, self::IN_DECIMAL_POINT, self::ST_MANTISSA),
array(self::ST_NUMBER, self::IN_ASCII_DIGIT, self::ST_NUMBER),
// IN_CHAR not allowed
array(self::ST_NUMBER, self::IN_CHAR, self::ST_ERROR, $wrongNumberErrorAction),
));
// Transitions from ST_MANTISSA (digits after the decimal point)
$this->addRules(array( array(self::ST_MANTISSA, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE),
array(self::ST_MANTISSA, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME),
array(self::ST_MANTISSA, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME),
array(self::ST_MANTISSA, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
// IN_ESCAPE_CHAR not allowed
array(self::ST_MANTISSA, self::IN_ESCAPE_CHAR, self::ST_ERROR, $wrongNumberErrorAction),
// IN_QUOTE not allowed
array(self::ST_MANTISSA, self::IN_QUOTE, self::ST_ERROR, $wrongNumberErrorAction),
// IN_DECIMAL_POINT not allowed
array(self::ST_MANTISSA, self::IN_DECIMAL_POINT, self::ST_ERROR, $wrongNumberErrorAction),
array(self::ST_MANTISSA, self::IN_ASCII_DIGIT, self::ST_MANTISSA),
// IN_CHAR not allowed
array(self::ST_MANTISSA, self::IN_CHAR, self::ST_ERROR, $wrongNumberErrorAction),
));
/** Actions */
// Each action wraps a public method of this lexer, referenced by name
$syntaxLexemeAction = new Zend_Search_Lucene_FSMAction($this, 'addQuerySyntaxLexeme');
$lexemeModifierAction = new Zend_Search_Lucene_FSMAction($this, 'addLexemeModifier');
$addLexemeAction = new Zend_Search_Lucene_FSMAction($this, 'addLexeme');
$addQuotedLexemeAction = new Zend_Search_Lucene_FSMAction($this, 'addQuotedLexeme');
$addNumberLexemeAction = new Zend_Search_Lucene_FSMAction($this, 'addNumberLexeme');
$addLexemeCharAction = new Zend_Search_Lucene_FSMAction($this, 'addLexemeChar');
/** Syntax lexeme */
$this->addEntryAction(self::ST_SYNT_LEXEME, $syntaxLexemeAction);
// Two lexemes in succession
$this->addTransitionAction(self::ST_SYNT_LEXEME, self::ST_SYNT_LEXEME, $syntaxLexemeAction);
/** Lexeme */
// Characters are accumulated while the machine enters or stays in ST_LEXEME;
// the accumulated lexeme is emitted when the machine leaves ST_LEXEME.
$this->addEntryAction(self::ST_LEXEME, $addLexemeCharAction);
$this->addTransitionAction(self::ST_LEXEME, self::ST_LEXEME, $addLexemeCharAction);
// ST_ESCAPED_CHAR => ST_LEXEME transition is covered by ST_LEXEME entry action
$this->addTransitionAction(self::ST_LEXEME, self::ST_WHITE_SPACE, $addLexemeAction);
$this->addTransitionAction(self::ST_LEXEME, self::ST_SYNT_LEXEME, $addLexemeAction);
$this->addTransitionAction(self::ST_LEXEME, self::ST_QUOTED_LEXEME, $addLexemeAction);
$this->addTransitionAction(self::ST_LEXEME, self::ST_LEXEME_MODIFIER, $addLexemeAction);
$this->addTransitionAction(self::ST_LEXEME, self::ST_NUMBER, $addLexemeAction);
$this->addTransitionAction(self::ST_LEXEME, self::ST_MANTISSA, $addLexemeAction);
/** Quoted lexeme */
// We don't need entry action (skip quote)
$this->addTransitionAction(self::ST_QUOTED_LEXEME, self::ST_QUOTED_LEXEME, $addLexemeCharAction);
$this->addTransitionAction(self::ST_ESCAPED_QCHAR, self::ST_QUOTED_LEXEME, $addLexemeCharAction);
// Closing quote changes state to the ST_WHITE_SPACE other states are not used
$this->addTransitionAction(self::ST_QUOTED_LEXEME, self::ST_WHITE_SPACE, $addQuotedLexemeAction);
/** Lexeme modifier */
$this->addEntryAction(self::ST_LEXEME_MODIFIER, $lexemeModifierAction);
/** Number */
// Digits and the decimal point are accumulated with the same action as lexeme characters;
// the number is emitted when the machine leaves ST_NUMBER or ST_MANTISSA.
$this->addEntryAction(self::ST_NUMBER, $addLexemeCharAction);
$this->addEntryAction(self::ST_MANTISSA, $addLexemeCharAction);
$this->addTransitionAction(self::ST_NUMBER, self::ST_NUMBER, $addLexemeCharAction);
// ST_NUMBER => ST_MANTISSA transition is covered by ST_MANTISSA entry action
$this->addTransitionAction(self::ST_MANTISSA, self::ST_MANTISSA, $addLexemeCharAction);
$this->addTransitionAction(self::ST_NUMBER, self::ST_WHITE_SPACE, $addNumberLexemeAction);
$this->addTransitionAction(self::ST_NUMBER, self::ST_SYNT_LEXEME, $addNumberLexemeAction);
$this->addTransitionAction(self::ST_NUMBER, self::ST_LEXEME_MODIFIER, $addNumberLexemeAction);
$this->addTransitionAction(self::ST_MANTISSA, self::ST_WHITE_SPACE, $addNumberLexemeAction);
$this->addTransitionAction(self::ST_MANTISSA, self::ST_SYNT_LEXEME, $addNumberLexemeAction);
$this->addTransitionAction(self::ST_MANTISSA, self::ST_LEXEME_MODIFIER, $addNumberLexemeAction);
}
/**
* Translate input char to an input symbol of state machine
*
* @param string $char
* @return integer
*/
private function _translateInput($char)
{
    // Classes defined as sets of characters are checked first, in a fixed order
    if (strpos(self::QUERY_WHITE_SPACE_CHARS, $char) !== false) {
        return self::IN_WHITE_SPACE;
    }
    if (strpos(self::QUERY_SYNT_CHARS, $char) !== false) {
        return self::IN_SYNT_CHAR;
    }
    if (strpos(self::QUERY_MUTABLE_CHARS, $char) !== false) {
        return self::IN_MUTABLE_CHAR;
    }
    if (strpos(self::QUERY_LEXEMEMODIFIER_CHARS, $char) !== false) {
        return self::IN_LEXEME_MODIFIER;
    }
    if (strpos(self::QUERY_ASCIIDIGITS_CHARS, $char) !== false) {
        return self::IN_ASCII_DIGIT;
    }

    // Classes consisting of exactly one character
    if ($char === '"') {
        return self::IN_QUOTE;
    }
    if ($char === '.') {
        return self::IN_DECIMAL_POINT;
    }
    if ($char === '\\') {
        return self::IN_ESCAPE_CHAR;
    }

    // Anything else is an ordinary lexeme character
    return self::IN_CHAR;
}
/**
* This method is used to tokenize query string into lexemes
*
* @param string $inputString
* @param string $encoding
* @return array
* @throws Zend_Search_Lucene_Search_QueryParserException
*/
public function tokenize($inputString, $encoding)
{
    $this->reset();

    $this->_lexemes     = array();
    $this->_queryString = array();
    // A previous call which was aborted by an exception (unterminated quote,
    // wrong modifier or number syntax) leaves its partially collected lexeme
    // behind. Drop it, otherwise it would be prepended to the first lexeme
    // recognized in this query.
    $this->_currentLexeme = '';

    if (PHP_OS == 'AIX' && $encoding == '') {
        $encoding = 'ISO8859-1';
    }
    $strLength = iconv_strlen($inputString, $encoding);

    // Workaround for iconv_substr bug
    $inputString .= ' ';

    // Split the query into an array of (possibly multi-byte) characters
    for ($count = 0; $count < $strLength; $count++) {
        $this->_queryString[$count] = iconv_substr($inputString, $count, 1, $encoding);
    }

    // Feed the characters to the state machine one by one. The position is kept
    // in a property because actions read it and addQuerySyntaxLexeme() may
    // advance it when it consumes a two-character lexeme.
    for ($this->_queryStringPosition = 0;
         $this->_queryStringPosition < count($this->_queryString);
         $this->_queryStringPosition++) {
        $this->process($this->_translateInput($this->_queryString[$this->_queryStringPosition]));
    }

    // Terminating white space: flushes a lexeme which is still being collected
    $this->process(self::IN_WHITE_SPACE);

    // Any other final state means that the query ended inside an unfinished construction
    if ($this->getState() != self::ST_WHITE_SPACE) {
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
        throw new Zend_Search_Lucene_Search_QueryParserException('Unexpected end of query');
    }

    $this->_queryString = null;

    return $this->_lexemes;
}
/*********************************************************************
* Actions implementation
*
* Actions modify the list of recognized lexemes
*********************************************************************/
/**
* Add query syntax lexeme
*
* @throws Zend_Search_Lucene_Search_QueryParserException
*/
public function addQuerySyntaxLexeme()
{
    $char = $this->_queryString[$this->_queryStringPosition];
    $text = $char;

    // '|' and '&' are accepted only as doubled lexemes
    if (strpos(self::QUERY_DOUBLECHARLEXEME_CHARS, $char) !== false) {
        // Move on to the character which has to repeat the first one
        $this->_queryStringPosition++;

        $endReached = ($this->_queryStringPosition == count($this->_queryString));
        if ($endReached || $this->_queryString[$this->_queryStringPosition] != $char) {
            require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
            throw new Zend_Search_Lucene_Search_QueryParserException('Two chars lexeme expected. ' . $this->_positionMsg());
        }

        $text = $char . $char;
    }

    $syntaxToken = new Zend_Search_Lucene_Search_QueryToken(
        Zend_Search_Lucene_Search_QueryToken::TC_SYNTAX_ELEMENT,
        $text,
        $this->_queryStringPosition
    );

    // Everything except the field indicator is simply appended to the list
    if ($syntaxToken->type != Zend_Search_Lucene_Search_QueryToken::TT_FIELD_INDICATOR) {
        $this->_lexemes[] = $syntaxToken;
        return;
    }

    // The field indicator ':' itself is not stored. Instead, the preceding
    // token is taken back from the list and re-typed from 'word' to 'field'.
    $fieldToken = array_pop($this->_lexemes);

    $precededByWord = ($fieldToken !== null
                       && $fieldToken->type == Zend_Search_Lucene_Search_QueryToken::TT_WORD);
    if (!$precededByWord) {
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
        throw new Zend_Search_Lucene_Search_QueryParserException('Field mark \':\' must follow field name. ' . $this->_positionMsg());
    }

    $fieldToken->type = Zend_Search_Lucene_Search_QueryToken::TT_FIELD;
    $this->_lexemes[] = $fieldToken;
}
/**
* Add lexeme modifier
*/
public function addLexemeModifier()
{
    // The modifier is the character the lexer is currently positioned on
    $position = $this->_queryStringPosition;
    $modifier = $this->_queryString[$position];

    $this->_lexemes[] = new Zend_Search_Lucene_Search_QueryToken(
        Zend_Search_Lucene_Search_QueryToken::TC_SYNTAX_ELEMENT,
        $modifier,
        $position
    );
}
/**
* Add lexeme
*/
public function addLexeme()
{
    // The token is registered with the position preceding the current one
    $tokenPosition = $this->_queryStringPosition - 1;

    $wordToken = new Zend_Search_Lucene_Search_QueryToken(
        Zend_Search_Lucene_Search_QueryToken::TC_WORD,
        $this->_currentLexeme,
        $tokenPosition
    );
    $this->_lexemes[] = $wordToken;

    // Start collecting the next lexeme from scratch
    $this->_currentLexeme = '';
}
/**
* Add quoted lexeme
*/
public function addQuotedLexeme()
{
    // Text collected between the quotes becomes a single phrase token,
    // registered with the current position
    $phraseToken = new Zend_Search_Lucene_Search_QueryToken(
        Zend_Search_Lucene_Search_QueryToken::TC_PHRASE,
        $this->_currentLexeme,
        $this->_queryStringPosition
    );
    $this->_lexemes[] = $phraseToken;

    // Start collecting the next lexeme from scratch
    $this->_currentLexeme = '';
}
/**
* Add number lexeme
*/
public function addNumberLexeme()
{
    // The token is registered with the position preceding the current one
    $tokenPosition = $this->_queryStringPosition - 1;

    $numberToken = new Zend_Search_Lucene_Search_QueryToken(
        Zend_Search_Lucene_Search_QueryToken::TC_NUMBER,
        $this->_currentLexeme,
        $tokenPosition
    );
    $this->_lexemes[] = $numberToken;

    // Start collecting the next lexeme from scratch
    $this->_currentLexeme = '';
}
/**
* Extend lexeme by one char
*/
public function addLexemeChar()
{
    $position = $this->_queryStringPosition;

    // tokenize() finishes by processing one extra white space input while the
    // position already points past the last character. For a query ending with
    // an unterminated quote or a trailing escape char this action is triggered
    // for that input. There is no character to append in this case, and reading
    // the missing offset would only raise an "undefined offset" notice/warning.
    if (!isset($this->_queryString[$position])) {
        return;
    }

    $this->_currentLexeme .= $this->_queryString[$position];
}
/**
* Position message
*
* @return string
*/
private function _positionMsg()
{
    // Suffix appended to lexer error messages
    $position = $this->_queryStringPosition;

    return 'Position is ' . $position . '.';
}
/*********************************************************************
* Syntax errors actions
*********************************************************************/
/**
 * Error action attached to the ST_LEXEME_MODIFIER rules which lead to ST_ERROR
 *
 * @throws Zend_Search_Lucene_Search_QueryParserException
 */
public function lexModifierErrException()
{
    require_once 'Zend/Search/Lucene/Search/QueryParserException.php';

    $message = 'Lexeme modifier character can be followed only by number, white space or query syntax element. '
             . $this->_positionMsg();

    throw new Zend_Search_Lucene_Search_QueryParserException($message);
}
/**
 * Error action attached to the ST_LEXEME rule for an unescaped quote
 *
 * @throws Zend_Search_Lucene_Search_QueryParserException
 */
public function quoteWithinLexemeErrException()
{
    require_once 'Zend/Search/Lucene/Search/QueryParserException.php';

    $message = 'Quote within lexeme must be escaped by \'\\\' char. ' . $this->_positionMsg();

    throw new Zend_Search_Lucene_Search_QueryParserException($message);
}
/**
 * Error action attached to the ST_NUMBER and ST_MANTISSA rules which lead to ST_ERROR
 *
 * @throws Zend_Search_Lucene_Search_QueryParserException
 */
public function wrongNumberErrException()
{
    require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
    // The trailing space separates the sentence from the position note,
    // as in the other lexer error messages.
    throw new Zend_Search_Lucene_Search_QueryParserException('Wrong number syntax. ' . $this->_positionMsg());
}
}
QueryParserException.php 0000666 00000002363 15125175533 0011435 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* Zend_Search_Lucene base exception
*/
require_once 'Zend/Search/Lucene/Exception.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*
* Special exception type, which may be used to intercept wrong user input
*/
class Zend_Search_Lucene_Search_QueryParserException extends Zend_Search_Lucene_Exception
{
    // Intentionally empty: the class adds nothing to its parent and exists
    // only so that query syntax errors can be caught by their own type.
}
QueryParser.php 0000666 00000063621 15125175533 0007562 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Index_Term */
require_once 'Zend/Search/Lucene/Index/Term.php';
/** Zend_Search_Lucene_Search_Query_Term */
require_once 'Zend/Search/Lucene/Search/Query/Term.php';
/** Zend_Search_Lucene_Search_Query_MultiTerm */
require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
/** Zend_Search_Lucene_Search_Query_Boolean */
require_once 'Zend/Search/Lucene/Search/Query/Boolean.php';
/** Zend_Search_Lucene_Search_Query_Phrase */
require_once 'Zend/Search/Lucene/Search/Query/Phrase.php';
/** Zend_Search_Lucene_Search_Query_Wildcard */
require_once 'Zend/Search/Lucene/Search/Query/Wildcard.php';
/** Zend_Search_Lucene_Search_Query_Range */
require_once 'Zend/Search/Lucene/Search/Query/Range.php';
/** Zend_Search_Lucene_Search_Query_Fuzzy */
require_once 'Zend/Search/Lucene/Search/Query/Fuzzy.php';
/** Zend_Search_Lucene_Search_Query_Empty */
require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
/** Zend_Search_Lucene_Search_Query_Insignificant */
require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
/** Zend_Search_Lucene_Search_QueryLexer */
require_once 'Zend/Search/Lucene/Search/QueryLexer.php';
/** Zend_Search_Lucene_Search_QueryParserContext */
require_once 'Zend/Search/Lucene/Search/QueryParserContext.php';
/** Zend_Search_Lucene_FSM */
require_once 'Zend/Search/Lucene/FSM.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_QueryParser extends Zend_Search_Lucene_FSM
{
/**
* Parser instance
*
* @var Zend_Search_Lucene_Search_QueryParser
*/
private static $_instance = null;
/**
* Query lexer
*
* @var Zend_Search_Lucene_Search_QueryLexer
*/
private $_lexer;
/**
* Tokens list
* Array of Zend_Search_Lucene_Search_QueryToken objects
*
* @var array
*/
private $_tokens;
/**
* Current token
*
* @var integer|string
*/
private $_currentToken;
/**
* Last token
*
* It can be processed within FSM states, but this additional state simplifies FSM
*
* @var Zend_Search_Lucene_Search_QueryToken
*/
private $_lastToken = null;
/**
* Range query first term
*
* @var string
*/
private $_rqFirstTerm = null;
/**
* Current query parser context
*
* @var Zend_Search_Lucene_Search_QueryParserContext
*/
private $_context;
/**
* Context stack
*
* @var array
*/
private $_contextStack;
/**
* Query string encoding
*
* @var string
*/
private $_encoding;
/**
* Query string default encoding
*
* @var string
*/
private $_defaultEncoding = '';
/**
* Defines query parsing mode.
*
* If this option is turned on, the query parser suppresses query parser exceptions
* and constructs a multi-term query using all words from the query.
*
* That helps to avoid exceptions caused by queries which don't conform to the query language,
* but limits the ability to detect inconsistencies in a query entered by the user.
*
*
* Default is true.
*
* Use {@link Zend_Search_Lucene_Search_QueryParser::suppressQueryParsingExceptions()},
* {@link Zend_Search_Lucene_Search_QueryParser::dontSuppressQueryParsingExceptions()} and
* {@link Zend_Search_Lucene_Search_QueryParser::queryParsingExceptionsSuppressed()} to operate
* with this setting.
*
* @var boolean
*/
private $_suppressQueryParsingExceptions = true;
/**
* Boolean operators constants
*/
const B_OR = 0;
const B_AND = 1;
/**
* Default boolean queries operator
*
* @var integer
*/
private $_defaultOperator = self::B_OR;
/** Query parser State Machine states */
const ST_COMMON_QUERY_ELEMENT = 0; // Terms, phrases, operators
const ST_CLOSEDINT_RQ_START = 1; // Range query start (closed interval) - '['
const ST_CLOSEDINT_RQ_FIRST_TERM = 2; // First term in '[term1 to term2]' construction
const ST_CLOSEDINT_RQ_TO_TERM = 3; // 'TO' lexeme in '[term1 to term2]' construction
const ST_CLOSEDINT_RQ_LAST_TERM = 4; // Second term in '[term1 to term2]' construction
const ST_CLOSEDINT_RQ_END = 5; // Range query end (closed interval) - ']'
const ST_OPENEDINT_RQ_START = 6; // Range query start (opened interval) - '{'
const ST_OPENEDINT_RQ_FIRST_TERM = 7; // First term in '{term1 to term2}' construction
const ST_OPENEDINT_RQ_TO_TERM = 8; // 'TO' lexeme in '{term1 to term2}' construction
const ST_OPENEDINT_RQ_LAST_TERM = 9; // Second term in '{term1 to term2}' construction
const ST_OPENEDINT_RQ_END = 10; // Range query end (opened interval) - '}'
/**
* Parser constructor
*/
public function __construct()
{
// Register the parser states; the input symbols are the token types reported by the query token class
parent::__construct(array(self::ST_COMMON_QUERY_ELEMENT,
self::ST_CLOSEDINT_RQ_START,
self::ST_CLOSEDINT_RQ_FIRST_TERM,
self::ST_CLOSEDINT_RQ_TO_TERM,
self::ST_CLOSEDINT_RQ_LAST_TERM,
self::ST_CLOSEDINT_RQ_END,
self::ST_OPENEDINT_RQ_START,
self::ST_OPENEDINT_RQ_FIRST_TERM,
self::ST_OPENEDINT_RQ_TO_TERM,
self::ST_OPENEDINT_RQ_LAST_TERM,
self::ST_OPENEDINT_RQ_END
),
Zend_Search_Lucene_Search_QueryToken::getTypes());
// Common query elements: every listed token type keeps the parser in ST_COMMON_QUERY_ELEMENT,
// except the two range start tokens, which switch to the range query states.
$this->addRules(
array(array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_WORD, self::ST_COMMON_QUERY_ELEMENT),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_PHRASE, self::ST_COMMON_QUERY_ELEMENT),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_FIELD, self::ST_COMMON_QUERY_ELEMENT),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_REQUIRED, self::ST_COMMON_QUERY_ELEMENT),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_PROHIBITED, self::ST_COMMON_QUERY_ELEMENT),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_FUZZY_PROX_MARK, self::ST_COMMON_QUERY_ELEMENT),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_BOOSTING_MARK, self::ST_COMMON_QUERY_ELEMENT),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_RANGE_INCL_START, self::ST_CLOSEDINT_RQ_START),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_RANGE_EXCL_START, self::ST_OPENEDINT_RQ_START),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_SUBQUERY_START, self::ST_COMMON_QUERY_ELEMENT),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_SUBQUERY_END, self::ST_COMMON_QUERY_ELEMENT),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_AND_LEXEME, self::ST_COMMON_QUERY_ELEMENT),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_OR_LEXEME, self::ST_COMMON_QUERY_ELEMENT),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_NOT_LEXEME, self::ST_COMMON_QUERY_ELEMENT),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_NUMBER, self::ST_COMMON_QUERY_ELEMENT)
));
// Closed interval range query '[term1 to term2]': word, TO lexeme, word, inclusive range end;
// the range end returns the parser to ST_COMMON_QUERY_ELEMENT
$this->addRules(
array(array(self::ST_CLOSEDINT_RQ_START, Zend_Search_Lucene_Search_QueryToken::TT_WORD, self::ST_CLOSEDINT_RQ_FIRST_TERM),
array(self::ST_CLOSEDINT_RQ_FIRST_TERM, Zend_Search_Lucene_Search_QueryToken::TT_TO_LEXEME, self::ST_CLOSEDINT_RQ_TO_TERM),
array(self::ST_CLOSEDINT_RQ_TO_TERM, Zend_Search_Lucene_Search_QueryToken::TT_WORD, self::ST_CLOSEDINT_RQ_LAST_TERM),
array(self::ST_CLOSEDINT_RQ_LAST_TERM, Zend_Search_Lucene_Search_QueryToken::TT_RANGE_INCL_END, self::ST_COMMON_QUERY_ELEMENT)
));
// Opened interval range query '{term1 to term2}': same sequence with the exclusive range end
$this->addRules(
array(array(self::ST_OPENEDINT_RQ_START, Zend_Search_Lucene_Search_QueryToken::TT_WORD, self::ST_OPENEDINT_RQ_FIRST_TERM),
array(self::ST_OPENEDINT_RQ_FIRST_TERM, Zend_Search_Lucene_Search_QueryToken::TT_TO_LEXEME, self::ST_OPENEDINT_RQ_TO_TERM),
array(self::ST_OPENEDINT_RQ_TO_TERM, Zend_Search_Lucene_Search_QueryToken::TT_WORD, self::ST_OPENEDINT_RQ_LAST_TERM),
array(self::ST_OPENEDINT_RQ_LAST_TERM, Zend_Search_Lucene_Search_QueryToken::TT_RANGE_EXCL_END, self::ST_COMMON_QUERY_ELEMENT)
));
// Each action wraps a method of this parser, referenced by name
$addTermEntryAction = new Zend_Search_Lucene_FSMAction($this, 'addTermEntry');
$addPhraseEntryAction = new Zend_Search_Lucene_FSMAction($this, 'addPhraseEntry');
$setFieldAction = new Zend_Search_Lucene_FSMAction($this, 'setField');
$setSignAction = new Zend_Search_Lucene_FSMAction($this, 'setSign');
$setFuzzyProxAction = new Zend_Search_Lucene_FSMAction($this, 'processFuzzyProximityModifier');
$processModifierParameterAction = new Zend_Search_Lucene_FSMAction($this, 'processModifierParameter');
$subqueryStartAction = new Zend_Search_Lucene_FSMAction($this, 'subqueryStart');
$subqueryEndAction = new Zend_Search_Lucene_FSMAction($this, 'subqueryEnd');
$logicalOperatorAction = new Zend_Search_Lucene_FSMAction($this, 'logicalOperator');
$openedRQFirstTermAction = new Zend_Search_Lucene_FSMAction($this, 'openedRQFirstTerm');
$openedRQLastTermAction = new Zend_Search_Lucene_FSMAction($this, 'openedRQLastTerm');
$closedRQFirstTermAction = new Zend_Search_Lucene_FSMAction($this, 'closedRQFirstTerm');
$closedRQLastTermAction = new Zend_Search_Lucene_FSMAction($this, 'closedRQLastTerm');
// Input actions for common query elements, selected by token type.
// Both sign tokens share one action, as do the three logical operator tokens.
// No input action is registered here for TT_BOOSTING_MARK.
$this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_WORD, $addTermEntryAction);
$this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_PHRASE, $addPhraseEntryAction);
$this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_FIELD, $setFieldAction);
$this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_REQUIRED, $setSignAction);
$this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_PROHIBITED, $setSignAction);
$this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_FUZZY_PROX_MARK, $setFuzzyProxAction);
$this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_NUMBER, $processModifierParameterAction);
$this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_SUBQUERY_START, $subqueryStartAction);
$this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_SUBQUERY_END, $subqueryEndAction);
$this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_AND_LEXEME, $logicalOperatorAction);
$this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_OR_LEXEME, $logicalOperatorAction);
$this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_NOT_LEXEME, $logicalOperatorAction);
// Range query terms are handled by entry actions of the first/last term states
$this->addEntryAction(self::ST_OPENEDINT_RQ_FIRST_TERM, $openedRQFirstTermAction);
$this->addEntryAction(self::ST_OPENEDINT_RQ_LAST_TERM, $openedRQLastTermAction);
$this->addEntryAction(self::ST_CLOSEDINT_RQ_FIRST_TERM, $closedRQFirstTermAction);
$this->addEntryAction(self::ST_CLOSEDINT_RQ_LAST_TERM, $closedRQLastTermAction);
// The parser owns a single lexer instance
$this->_lexer = new Zend_Search_Lucene_Search_QueryLexer();
}
/**
 * Return the shared query parser, creating it on first use.
 *
 * The instance is kept in the static $_instance property, so every static
 * entry point of this class works on the same parser object.
 *
 * @return Zend_Search_Lucene_Search_QueryParser
 */
private static function _getInstance()
{
    // Fast path: the parser has already been built.
    if (self::$_instance !== null) {
        return self::$_instance;
    }

    self::$_instance = new self();

    return self::$_instance;
}
/**
 * Set the encoding that parse() falls back to when it is called without
 * an explicit encoding.
 *
 * @param string $encoding
 */
public static function setDefaultEncoding($encoding)
{
    $parser = self::_getInstance();
    $parser->_defaultEncoding = $encoding;
}
/**
 * Return the encoding that parse() falls back to when it is called
 * without an explicit encoding.
 *
 * @return string
 */
public static function getDefaultEncoding()
{
    $parser = self::_getInstance();

    return $parser->_defaultEncoding;
}
/**
 * Set the default boolean operator stored on the shared parser.
 *
 * parse() compares this value with self::B_AND when it builds its fallback
 * query for a string that could not be parsed.
 *
 * @param integer $operator
 */
public static function setDefaultOperator($operator)
{
    $parser = self::_getInstance();
    $parser->_defaultOperator = $operator;
}
/**
 * Return the default boolean operator stored on the shared parser.
 *
 * @return integer
 */
public static function getDefaultOperator()
{
    $parser = self::_getInstance();

    return $parser->_defaultOperator;
}
/**
 * Switch the 'suppress query parser exceptions' mode on.
 *
 * While the mode is on, parse() answers a query parser exception by
 * returning a multi-term query built from the analyzed query string
 * instead of re-throwing the exception.
 */
public static function suppressQueryParsingExceptions()
{
    $parser = self::_getInstance();
    $parser->_suppressQueryParsingExceptions = true;
}
/**
 * Switch the 'suppress query parser exceptions' mode off, so that parse()
 * re-throws query parser exceptions to its caller.
 */
public static function dontSuppressQueryParsingExceptions()
{
    $parser = self::_getInstance();
    $parser->_suppressQueryParsingExceptions = false;
}
/**
 * Report whether the 'suppress query parser exceptions' mode is on.
 *
 * @return boolean
 */
public static function queryParsingExceptionsSuppressed()
{
    $parser = self::_getInstance();

    return $parser->_suppressQueryParsingExceptions;
}
/**
 * Parse a query string into a query object.
 *
 * The string is split into tokens by the lexer and each token type is fed
 * to the state machine, whose actions build the query in the current
 * parsing context. An empty token list yields an 'insignificant' query.
 *
 * If a query parser exception occurs while the 'suppress query parser
 * exceptions' mode is on, the string is instead run through the default
 * analyzer and every resulting token becomes a term of a multi-term query
 * (required when the default operator is B_AND, optional otherwise).
 *
 * @param string $strQuery
 * @param string $encoding  Encoding of $strQuery; null selects the default encoding
 * @return Zend_Search_Lucene_Search_Query
 * @throws Zend_Search_Lucene_Search_QueryParserException
 */
public static function parse($strQuery, $encoding = null)
{
    $parser = self::_getInstance();

    // Bring the FSM back to its start state in case a previous parse left it elsewhere
    $parser->reset();

    require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
    try {
        if ($encoding !== null) {
            $parser->_encoding = $encoding;
        } else {
            $parser->_encoding = $parser->_defaultEncoding;
        }

        $parser->_lastToken    = null;
        $parser->_context      = new Zend_Search_Lucene_Search_QueryParserContext($parser->_encoding);
        $parser->_contextStack = array();
        $parser->_tokens       = $parser->_lexer->tokenize($strQuery, $parser->_encoding);

        // Nothing to parse
        if (count($parser->_tokens) == 0) {
            return new Zend_Search_Lucene_Search_Query_Insignificant();
        }

        foreach ($parser->_tokens as $token) {
            try {
                $parser->_currentToken = $token;
                $parser->process($token->type);
                $parser->_lastToken = $token;
            } catch (Exception $e) {
                // Anything other than a missing FSM rule is passed through untouched
                if (strpos($e->getMessage(), 'There is no any rule for') === false) {
                    throw $e;
                }

                // A missing FSM rule means the token is not allowed at this point
                throw new Zend_Search_Lucene_Search_QueryParserException('Syntax error at char position ' . $token->position . '.');
            }
        }

        // Every opened subquery must have been closed by now
        if (count($parser->_contextStack) != 0) {
            throw new Zend_Search_Lucene_Search_QueryParserException('Syntax Error: mismatched parentheses, every opening must have closing.');
        }

        return $parser->_context->getQuery();
    } catch (Zend_Search_Lucene_Search_QueryParserException $e) {
        if (!$parser->_suppressQueryParsingExceptions) {
            throw $e;
        }

        // Fallback: treat the whole string as plain text
        $queryTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($strQuery, $parser->_encoding);

        $query = new Zend_Search_Lucene_Search_Query_MultiTerm();

        if ($parser->_defaultOperator == self::B_AND) {
            $termsSign = true; // required term
        } else {
            $termsSign = null; // optional term
        }

        foreach ($queryTokens as $token) {
            $query->addTerm(new Zend_Search_Lucene_Index_Term($token->getTermText()), $termsSign);
        }

        return $query;
    }
}
/*********************************************************************
* Actions implementation
*
* Actions operate on the list of recognized lexemes
*********************************************************************/
/**
 * Add a term entry to the current context.
 *
 * The entry is built from the current token text and the field reported
 * by the current context.
 */
public function addTermEntry()
{
    $termText = $this->_currentToken->text;
    $field    = $this->_context->getField();

    $this->_context->addEntry(new Zend_Search_Lucene_Search_QueryEntry_Term($termText, $field));
}
/**
 * Add a phrase entry to the current context.
 *
 * The entry is built from the current token text and the field reported
 * by the current context.
 */
public function addPhraseEntry()
{
    $phraseText = $this->_currentToken->text;
    $field      = $this->_context->getField();

    $this->_context->addEntry(new Zend_Search_Lucene_Search_QueryEntry_Phrase($phraseText, $field));
}
/**
 * Use the current token text as the field of the next entry.
 */
public function setField()
{
    $fieldName = $this->_currentToken->text;

    $this->_context->setNextEntryField($fieldName);
}
/**
 * Use the current token type as the sign of the next entry.
 */
public function setSign()
{
    $signType = $this->_currentToken->type;

    $this->_context->setNextEntrySign($signType);
}
/**
 * Process the fuzzy search/proximity modifier '~' by forwarding it,
 * without a parameter, to the current context.
 */
public function processFuzzyProximityModifier()
{
    $context = $this->_context;

    $context->processFuzzyProximityModifier();
}
/**
 * Process a modifier parameter.
 *
 * The current token text is handed to the current context according to
 * the type of the previous token: after a fuzzy/proximity mark it becomes
 * the parameter of that modifier, after a boosting mark it becomes the
 * boost value.
 *
 * @throws Zend_Search_Lucene_Search_QueryParserException If there is no previous token
 * @throws Zend_Search_Lucene_Exception If the previous token is not a lexeme modifier
 */
public function processModifierParameter()
{
    if ($this->_lastToken === null) {
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
        throw new Zend_Search_Lucene_Search_QueryParserException('Lexeme modifier parameter must follow lexeme modifier. Char position 0.' );
    }

    switch ($this->_lastToken->type) {
        case Zend_Search_Lucene_Search_QueryToken::TT_FUZZY_PROX_MARK:
            $this->_context->processFuzzyProximityModifier($this->_currentToken->text);
            break;

        case Zend_Search_Lucene_Search_QueryToken::TT_BOOSTING_MARK:
            $this->_context->boost($this->_currentToken->text);
            break;

        default:
            // It's not a user input exception
            require_once 'Zend/Search/Lucene/Exception.php';
            // A previous token exists here, so report where the offending
            // parameter actually is rather than a hard-coded position 0.
            throw new Zend_Search_Lucene_Exception('Lexeme modifier parameter must follow lexeme modifier. Char position ' . $this->_currentToken->position . '.' );
    }
}
/**
 * Start a subquery.
 *
 * The current context is pushed onto the context stack and replaced by a
 * fresh one created with the parser encoding and the outer context's field.
 */
public function subqueryStart()
{
    $outerContext = $this->_context;

    $this->_contextStack[] = $outerContext;
    $this->_context = new Zend_Search_Lucene_Search_QueryParserContext($this->_encoding, $outerContext->getField());
}
/**
 * End a subquery.
 *
 * The query built by the current context is wrapped in a subquery entry
 * and added to the context popped from the context stack, which becomes
 * the current context again.
 *
 * @throws Zend_Search_Lucene_Search_QueryParserException If there is no open subquery
 */
public function subqueryEnd()
{
    if (!count($this->_contextStack)) {
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
        throw new Zend_Search_Lucene_Search_QueryParserException('Syntax Error: mismatched parentheses, every opening must have closing. Char position ' . $this->_currentToken->position . '.' );
    }

    $subquery = $this->_context->getQuery();

    $outerContext   = array_pop($this->_contextStack);
    $this->_context = $outerContext;

    $outerContext->addEntry(new Zend_Search_Lucene_Search_QueryEntry_Subquery($subquery));
}
/**
 * Process a logical operator by passing the current token type to the
 * current context.
 */
public function logicalOperator()
{
    $operatorType = $this->_currentToken->type;

    $this->_context->addLogicalOperator($operatorType);
}
/**
 * Process the first range query term (opened interval).
 *
 * The text of the current token is remembered until the last term of the
 * range is processed.
 */
public function openedRQFirstTerm()
{
    $firstTermText = $this->_currentToken->text;

    $this->_rqFirstTerm = $firstTermText;
}
/**
 * Process the last range query term (opened interval).
 *
 * Both boundary texts (the remembered first term and the current token)
 * are run through the default analyzer. A boundary that yields one token
 * becomes an index term in the current field, a boundary that yields no
 * token becomes null. The resulting range query is created with false as
 * its third constructor argument and added to the current context as a
 * subquery entry.
 *
 * @throws Zend_Search_Lucene_Search_QueryParserException If a boundary analyzes to more
 *         than one token, or if both boundaries are empty
 */
public function openedRQLastTerm()
{
    $boundaryTerms = array();

    foreach (array($this->_rqFirstTerm, $this->_currentToken->text) as $boundaryText) {
        $tokens     = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($boundaryText, $this->_encoding);
        $tokenCount = count($tokens);

        if ($tokenCount > 1) {
            require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
            throw new Zend_Search_Lucene_Search_QueryParserException('Range query boundary terms must be non-multiple word terms');
        }

        if ($tokenCount == 1) {
            $boundaryTerms[] = new Zend_Search_Lucene_Index_Term(reset($tokens)->getTermText(), $this->_context->getField());
        } else {
            $boundaryTerms[] = null;
        }
    }

    list($from, $to) = $boundaryTerms;

    if ($from === null && $to === null) {
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
        throw new Zend_Search_Lucene_Search_QueryParserException('At least one range query boundary term must be non-empty term');
    }

    $rangeQuery = new Zend_Search_Lucene_Search_Query_Range($from, $to, false);

    $this->_context->addEntry(new Zend_Search_Lucene_Search_QueryEntry_Subquery($rangeQuery));
}
/**
 * Process the first range query term (closed interval).
 *
 * The text of the current token is remembered until the last term of the
 * range is processed.
 */
public function closedRQFirstTerm()
{
    $firstTermText = $this->_currentToken->text;

    $this->_rqFirstTerm = $firstTermText;
}
/**
 * Process the last range query term (closed interval).
 *
 * Both boundary texts (the remembered first term and the current token)
 * are run through the default analyzer. A boundary that yields one token
 * becomes an index term in the current field, a boundary that yields no
 * token becomes null. The resulting range query is created with true as
 * its third constructor argument and added to the current context as a
 * subquery entry.
 *
 * @throws Zend_Search_Lucene_Search_QueryParserException If a boundary analyzes to more
 *         than one token, or if both boundaries are empty
 */
public function closedRQLastTerm()
{
    $boundaryTerms = array();

    foreach (array($this->_rqFirstTerm, $this->_currentToken->text) as $boundaryText) {
        $tokens     = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($boundaryText, $this->_encoding);
        $tokenCount = count($tokens);

        if ($tokenCount > 1) {
            require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
            throw new Zend_Search_Lucene_Search_QueryParserException('Range query boundary terms must be non-multiple word terms');
        }

        if ($tokenCount == 1) {
            $boundaryTerms[] = new Zend_Search_Lucene_Index_Term(reset($tokens)->getTermText(), $this->_context->getField());
        } else {
            $boundaryTerms[] = null;
        }
    }

    list($from, $to) = $boundaryTerms;

    if ($from === null && $to === null) {
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
        throw new Zend_Search_Lucene_Search_QueryParserException('At least one range query boundary term must be non-empty term');
    }

    $rangeQuery = new Zend_Search_Lucene_Search_Query_Range($from, $to, true);

    $this->_context->addEntry(new Zend_Search_Lucene_Search_QueryEntry_Subquery($rangeQuery));
}
}
Similarity.php 0000666 00000061332 15125175533 0007423 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Similarity_Default */
require_once 'Zend/Search/Lucene/Search/Similarity/Default.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Search_Similarity
{
/**
* The Similarity implementation used by default.
*
* @var Zend_Search_Lucene_Search_Similarity
*/
private static $_defaultImpl;
/**
* Cache of decoded bytes.
* Array of floats
*
* @var array
*/
private static $_normTable = array( 0 => 0.0,
1 => 5.820766E-10,
2 => 6.9849193E-10,
3 => 8.1490725E-10,
4 => 9.313226E-10,
5 => 1.1641532E-9,
6 => 1.3969839E-9,
7 => 1.6298145E-9,
8 => 1.8626451E-9,
9 => 2.3283064E-9,
10 => 2.7939677E-9,
11 => 3.259629E-9,
12 => 3.7252903E-9,
13 => 4.656613E-9,
14 => 5.5879354E-9,
15 => 6.519258E-9,
16 => 7.4505806E-9,
17 => 9.313226E-9,
18 => 1.1175871E-8,
19 => 1.3038516E-8,
20 => 1.4901161E-8,
21 => 1.8626451E-8,
22 => 2.2351742E-8,
23 => 2.6077032E-8,
24 => 2.9802322E-8,
25 => 3.7252903E-8,
26 => 4.4703484E-8,
27 => 5.2154064E-8,
28 => 5.9604645E-8,
29 => 7.4505806E-8,
30 => 8.940697E-8,
31 => 1.0430813E-7,
32 => 1.1920929E-7,
33 => 1.4901161E-7,
34 => 1.7881393E-7,
35 => 2.0861626E-7,
36 => 2.3841858E-7,
37 => 2.9802322E-7,
38 => 3.5762787E-7,
39 => 4.172325E-7,
40 => 4.7683716E-7,
41 => 5.9604645E-7,
42 => 7.1525574E-7,
43 => 8.34465E-7,
44 => 9.536743E-7,
45 => 1.1920929E-6,
46 => 1.4305115E-6,
47 => 1.66893E-6,
48 => 1.9073486E-6,
49 => 2.3841858E-6,
50 => 2.861023E-6,
51 => 3.33786E-6,
52 => 3.8146973E-6,
53 => 4.7683716E-6,
54 => 5.722046E-6,
55 => 6.67572E-6,
56 => 7.6293945E-6,
57 => 9.536743E-6,
58 => 1.1444092E-5,
59 => 1.335144E-5,
60 => 1.5258789E-5,
61 => 1.9073486E-5,
62 => 2.2888184E-5,
63 => 2.670288E-5,
64 => 3.0517578E-5,
65 => 3.8146973E-5,
66 => 4.5776367E-5,
67 => 5.340576E-5,
68 => 6.1035156E-5,
69 => 7.6293945E-5,
70 => 9.1552734E-5,
71 => 1.0681152E-4,
72 => 1.2207031E-4,
73 => 1.5258789E-4,
74 => 1.8310547E-4,
75 => 2.1362305E-4,
76 => 2.4414062E-4,
77 => 3.0517578E-4,
78 => 3.6621094E-4,
79 => 4.272461E-4,
80 => 4.8828125E-4,
81 => 6.1035156E-4,
82 => 7.324219E-4,
83 => 8.544922E-4,
84 => 9.765625E-4,
85 => 0.0012207031,
86 => 0.0014648438,
87 => 0.0017089844,
88 => 0.001953125,
89 => 0.0024414062,
90 => 0.0029296875,
91 => 0.0034179688,
92 => 0.00390625,
93 => 0.0048828125,
94 => 0.005859375,
95 => 0.0068359375,
96 => 0.0078125,
97 => 0.009765625,
98 => 0.01171875,
99 => 0.013671875,
100 => 0.015625,
101 => 0.01953125,
102 => 0.0234375,
103 => 0.02734375,
104 => 0.03125,
105 => 0.0390625,
106 => 0.046875,
107 => 0.0546875,
108 => 0.0625,
109 => 0.078125,
110 => 0.09375,
111 => 0.109375,
112 => 0.125,
113 => 0.15625,
114 => 0.1875,
115 => 0.21875,
116 => 0.25,
117 => 0.3125,
118 => 0.375,
119 => 0.4375,
120 => 0.5,
121 => 0.625,
122 => 0.75,
123 => 0.875,
124 => 1.0,
125 => 1.25,
126 => 1.5,
127 => 1.75,
128 => 2.0,
129 => 2.5,
130 => 3.0,
131 => 3.5,
132 => 4.0,
133 => 5.0,
134 => 6.0,
135 => 7.0,
136 => 8.0,
137 => 10.0,
138 => 12.0,
139 => 14.0,
140 => 16.0,
141 => 20.0,
142 => 24.0,
143 => 28.0,
144 => 32.0,
145 => 40.0,
146 => 48.0,
147 => 56.0,
148 => 64.0,
149 => 80.0,
150 => 96.0,
151 => 112.0,
152 => 128.0,
153 => 160.0,
154 => 192.0,
155 => 224.0,
156 => 256.0,
157 => 320.0,
158 => 384.0,
159 => 448.0,
160 => 512.0,
161 => 640.0,
162 => 768.0,
163 => 896.0,
164 => 1024.0,
165 => 1280.0,
166 => 1536.0,
167 => 1792.0,
168 => 2048.0,
169 => 2560.0,
170 => 3072.0,
171 => 3584.0,
172 => 4096.0,
173 => 5120.0,
174 => 6144.0,
175 => 7168.0,
176 => 8192.0,
177 => 10240.0,
178 => 12288.0,
179 => 14336.0,
180 => 16384.0,
181 => 20480.0,
182 => 24576.0,
183 => 28672.0,
184 => 32768.0,
185 => 40960.0,
186 => 49152.0,
187 => 57344.0,
188 => 65536.0,
189 => 81920.0,
190 => 98304.0,
191 => 114688.0,
192 => 131072.0,
193 => 163840.0,
194 => 196608.0,
195 => 229376.0,
196 => 262144.0,
197 => 327680.0,
198 => 393216.0,
199 => 458752.0,
200 => 524288.0,
201 => 655360.0,
202 => 786432.0,
203 => 917504.0,
204 => 1048576.0,
205 => 1310720.0,
206 => 1572864.0,
207 => 1835008.0,
208 => 2097152.0,
209 => 2621440.0,
210 => 3145728.0,
211 => 3670016.0,
212 => 4194304.0,
213 => 5242880.0,
214 => 6291456.0,
215 => 7340032.0,
216 => 8388608.0,
217 => 1.048576E7,
218 => 1.2582912E7,
219 => 1.4680064E7,
220 => 1.6777216E7,
221 => 2.097152E7,
222 => 2.5165824E7,
223 => 2.9360128E7,
224 => 3.3554432E7,
225 => 4.194304E7,
226 => 5.0331648E7,
227 => 5.8720256E7,
228 => 6.7108864E7,
229 => 8.388608E7,
230 => 1.00663296E8,
231 => 1.17440512E8,
232 => 1.34217728E8,
233 => 1.6777216E8,
234 => 2.01326592E8,
235 => 2.34881024E8,
236 => 2.68435456E8,
237 => 3.3554432E8,
238 => 4.02653184E8,
239 => 4.69762048E8,
240 => 5.3687091E8,
241 => 6.7108864E8,
242 => 8.0530637E8,
243 => 9.395241E8,
244 => 1.07374182E9,
245 => 1.34217728E9,
246 => 1.61061274E9,
247 => 1.87904819E9,
248 => 2.14748365E9,
249 => 2.68435456E9,
250 => 3.22122547E9,
251 => 3.75809638E9,
252 => 4.2949673E9,
253 => 5.3687091E9,
254 => 6.4424509E9,
255 => 7.5161928E9 );
/**
* Set the default Similarity implementation used by indexing and search
* code.
*
* The instance is stored in a static property and is what getDefault()
* returns from then on.
*
* @param Zend_Search_Lucene_Search_Similarity $similarity
* @return void
*/
public static function setDefault(Zend_Search_Lucene_Search_Similarity $similarity)
{
self::$_defaultImpl = $similarity;
}
/**
 * Return the default Similarity implementation used by indexing and search
 * code.
 *
 * If no Similarity instance has been stored yet, a
 * Zend_Search_Lucene_Search_Similarity_Default is created, stored and
 * returned.
 *
 * @return Zend_Search_Lucene_Search_Similarity
 */
public static function getDefault()
{
    if (self::$_defaultImpl instanceof Zend_Search_Lucene_Search_Similarity) {
        return self::$_defaultImpl;
    }

    self::$_defaultImpl = new Zend_Search_Lucene_Search_Similarity_Default();

    return self::$_defaultImpl;
}
/**
* Computes the normalization value for a field given the total number of
* terms contained in a field. These values, together with field boosts, are
* stored in an index and multiplied into scores for hits on each field by the
* search code.
*
* Matches in longer fields are less precise, so implementations of this
* method usually return smaller values when 'numTokens' is large,
* and larger values when 'numTokens' is small.
*
* Note that these values are computed under
* IndexWriter::addDocument(Document) and then stored using
* encodeNorm(float). Thus they have limited precision, and documents
* must be re-indexed if this method is altered.
*
* fieldName - name of field
* numTokens - the total number of tokens contained in fields named
* 'fieldName' of 'doc'.
* Returns a normalization factor for hits on this field of this document
*
* @param string $fieldName
* @param integer $numTokens
* @return float
*/
abstract public function lengthNorm($fieldName, $numTokens);
/**
* Computes the normalization value for a query given the sum of the squared
* weights of each of the query terms. This value is then multiplied into the
* weight of each query term.
*
* This does not affect ranking, but rather just attempts to make scores
* from different queries comparable.
*
* sumOfSquaredWeights - the sum of the squares of query term weights
* Returns a normalization factor for query weights
*
* @param float $sumOfSquaredWeights
* @return float
*/
abstract public function queryNorm($sumOfSquaredWeights);
/**
 * Decodes a normalization factor stored in an index.
 *
 * Only the low eight bits of $byte are used; they select an entry of the
 * precomputed norm table.
 *
 * @param integer $byte
 * @return float
 */
public static function decodeNorm($byte)
{
    $tableIndex = $byte & 0xFF;

    return self::$_normTable[$tableIndex];
}
/**
 * Encodes a normalization factor for storage in an index.
 *
 * The result is an index (0..255) into the norm table used by
 * decodeNorm(). The table covers zero plus positive values from about
 * 5.8x10^-10 up to about 7.5x10^9, so decoded values carry roughly one
 * significant decimal digit of accuracy.
 * Negative numbers are rounded up to zero. Values too large to represent
 * are rounded down to the largest representable value. Positive values too
 * small to represent are rounded up to the smallest positive representable
 * value.
 *
 * @param float $f
 * @return integer
 */
public static function encodeNorm($f)
{
    return self::_floatToByte($f);
}
/**
 * Float to byte conversion.
 *
 * Finds the norm table index whose value is closest to $f, using a binary
 * search over the table (whose entries are in ascending order).
 *
 * Zero and negative input map to index 0. Positive input never maps to
 * index 0: a value below the smallest positive table entry is rounded up
 * to index 1, so that a positive norm does not decode back to zero.
 * Input above the largest table entry maps to index 255.
 *
 * @param float $f
 * @return integer Index into the norm table, 0..255
 */
private static function _floatToByte($f)
{
    // round negatives up to zero
    if ($f <= 0.0) {
        return 0;
    }

    // search for appropriate value
    $lowIndex  = 0;
    $highIndex = 255;
    while ($highIndex >= $lowIndex) {
        $mid   = ($highIndex + $lowIndex) >> 1;
        $delta = $f - self::$_normTable[$mid];

        if ($delta < 0) {
            $highIndex = $mid - 1;
        } elseif ($delta > 0) {
            $lowIndex = $mid + 1;
        } else {
            return $mid; // We got it!
        }
    }

    // No exact match: $highIndex is now the last entry below $f
    // (at least 0, because $f is positive and entry 0 is 0.0).

    // Positive values too small to represent are rounded up to the
    // smallest positive representable value instead of collapsing to zero.
    if ($highIndex == 0) {
        return 1;
    }

    // round to closest value
    if ($highIndex != 255 &&
        $f - self::$_normTable[$highIndex] > self::$_normTable[$highIndex + 1] - $f) {
        return $highIndex + 1;
    }

    return $highIndex;
}
/**
* Computes a score factor based on a term or phrase's frequency in a
* document. This value is multiplied by the idf(Term, Searcher)
* factor for each term in the query and these products are then summed to
* form the initial score for a document.
*
* Terms and phrases repeated in a document indicate the topic of the
* document, so implementations of this method usually return larger values
* when 'freq' is large, and smaller values when 'freq'
* is small.
*
* freq - the frequency of a term within a document
* Returns a score factor based on a term's within-document frequency
*
* @param float $freq
* @return float
*/
abstract public function tf($freq);
/**
* Computes the amount of a sloppy phrase match, based on an edit distance.
* This value is summed for each sloppy phrase match in a document to form
* the frequency that is passed to tf(float).
*
* A phrase match with a small edit distance to a document passage more
* closely matches the document, so implementations of this method usually
* return larger values when the edit distance is small and smaller values
* when it is large.
*
* distance - the edit distance of this sloppy phrase match
* Returns the frequency increment for this match
*
* @param integer $distance
* @return float
*/
abstract public function sloppyFreq($distance);
/**
 * Computes a score factor for a simple term or a phrase.
 *
 * For a single term the result is
 * idfFreq($reader->docFreq($term), $reader->count()).
 * For an array of terms the result is the sum of that value over all
 * terms in the array (0.0 for an empty array).
 *
 * @param mixed $input  The term in question or an array of terms
 * @param Zend_Search_Lucene_Interface $reader  Reader of the document collection being searched
 * @return float A score factor for the term(s)
 */
public function idf($input, Zend_Search_Lucene_Interface $reader)
{
    if (is_array($input)) {
        $total = 0.0;
        foreach ($input as $term) {
            $total += $this->idfFreq($reader->docFreq($term), $reader->count());
        }

        return $total;
    }

    return $this->idfFreq($reader->docFreq($input), $reader->count());
}
/**
* Computes a score factor based on a term's document frequency (the number
* of documents which contain the term). This value is multiplied by the
* tf(int) factor for each term in the query and these products are
* then summed to form the initial score for a document.
*
* Terms that occur in fewer documents are better indicators of topic, so
* implementations of this method usually return larger values for rare terms,
* and smaller values for common terms.
*
* docFreq - the number of documents which contain the term
* numDocs - the total number of documents in the collection
* Returns a score factor based on the term's document frequency
*
* @param integer $docFreq
* @param integer $numDocs
* @return float
*/
abstract public function idfFreq($docFreq, $numDocs);
/**
* Computes a score factor based on the fraction of all query terms that a
* document contains. This value is multiplied into scores.
*
* The presence of a large portion of the query terms indicates a better
* match with the query, so implementations of this method usually return
* larger values when the ratio between these parameters is large and smaller
* values when the ratio between them is small.
*
* overlap - the number of query terms matched in the document
* maxOverlap - the total number of terms in the query
* Returns a score factor based on term overlap with the query
*
* @param integer $overlap
* @param integer $maxOverlap
* @return float
*/
abstract public function coord($overlap, $maxOverlap);
}
Weight.php 0000666 00000004575 15125175533 0006532 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* Calculate query weights and build query scorers.
*
* A Weight is constructed by a query Query->createWeight().
* The sumOfSquaredWeights() method is then called on the top-level
* query to compute the query normalization factor Similarity->queryNorm(float).
* This factor is then passed to normalize(float). At this point the weighting
* is complete.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Search_Weight
{
/**
* Normalization factor.
* This value is stored only for query explanation purposes and is not used anywhere else.
*
* @var float
*/
protected $_queryNorm;
/**
* Weight value
*
* The weight value may be initialized in sumOfSquaredWeights() or normalize(),
* because both are invoked either in Query::_initWeight (for a top-level query) or
* in the corresponding methods of the parent query's weights.
*
* @var float
*/
protected $_value;
/**
* The weight for this query.
*
* Returns the stored weight value as is; it is not computed here.
*
* @return float
*/
public function getValue()
{
return $this->_value;
}
/**
* The sum of squared weights of contained query clauses.
*
* @return float
*/
abstract public function sumOfSquaredWeights();
/**
* Assigns the query normalization factor to this.
*
* @param float $norm
*/
abstract public function normalize($norm);
}
Query.php 0000666 00000014165 15125175533 0006404 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Document_Html */
require_once 'Zend/Search/Lucene/Document/Html.php';
/** Zend_Search_Lucene_Index_DocsFilter */
require_once 'Zend/Search/Lucene/Index/DocsFilter.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Search_Query
{
    /**
     * Query boost factor
     *
     * @var float
     */
    private $_boost = 1;

    /**
     * Query weight; null until a weight has been created for this query
     *
     * @var Zend_Search_Lucene_Search_Weight
     */
    protected $_weight = null;

    /**
     * Current highlight color
     *
     * @var integer
     */
    private $_currentColorIndex = 0;

    /**
     * List of colors for text highlighting
     *
     * @var array
     */
    private $_highlightColors = array('#66ffff', '#ff66ff', '#ffff66',
                                      '#ff8888', '#88ff88', '#8888ff',
                                      '#88dddd', '#dd88dd', '#dddd88',
                                      '#aaddff', '#aaffdd', '#ddaaff', '#ddffaa', '#ffaadd', '#ffddaa');

    /**
     * Gets the boost for this clause. Documents matching
     * this clause will (in addition to the normal weightings) have their score
     * multiplied by boost. The boost is 1 by default.
     *
     * @return float
     */
    public function getBoost()
    {
        return $this->_boost;
    }

    /**
     * Sets the boost for this query clause to $boost.
     *
     * @param float $boost
     */
    public function setBoost($boost)
    {
        $this->_boost = $boost;
    }

    /**
     * Score specified document
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     */
    abstract public function score($docId, Zend_Search_Lucene_Interface $reader);

    /**
     * Get document ids likely matching the query
     *
     * It's an array with document ids as keys (performance considerations)
     *
     * @return array
     */
    abstract public function matchedDocs();

    /**
     * Execute query in context of index reader
     * It also initializes necessary internal structures
     *
     * Query specific implementation
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
     */
    abstract public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null);

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @return Zend_Search_Lucene_Search_Weight
     */
    abstract public function createWeight(Zend_Search_Lucene_Interface $reader);

    /**
     * Constructs and initializes a Weight for a _top-level_query_.
     *
     * If the query already has a weight, it is returned untouched. Otherwise
     * a weight is created through createWeight(), the query normalization
     * factor is computed from its sum of squared weights by the reader's
     * similarity, and the weight is normalized with that factor.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @return Zend_Search_Lucene_Search_Weight The initialized weight of this query
     */
    protected function _initWeight(Zend_Search_Lucene_Interface $reader)
    {
        // Check, that it's a top-level query and query weight is not initialized yet.
        if ($this->_weight !== null) {
            return $this->_weight;
        }

        $weight = $this->createWeight($reader);

        // createWeight() is only documented to return the weight; adopt the
        // returned object when the implementation did not store it itself.
        if ($this->_weight === null) {
            $this->_weight = $weight;
        }

        $sum       = $this->_weight->sumOfSquaredWeights();
        $queryNorm = $reader->getSimilarity()->queryNorm($sum);
        $this->_weight->normalize($queryNorm);

        // Return the weight on this path too, so both paths agree.
        return $this->_weight;
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    abstract public function rewrite(Zend_Search_Lucene_Interface $index);

    /**
     * Optimize query in the context of specified index
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    abstract public function optimize(Zend_Search_Lucene_Interface $index);

    /**
     * Reset query, so it can be reused within other queries or
     * with other indexes.
     *
     * Drops the stored weight, so the next _initWeight() call builds a new one.
     */
    public function reset()
    {
        $this->_weight = null;
    }

    /**
     * Print a query
     *
     * @return string
     */
    abstract public function __toString();

    /**
     * Return query terms
     *
     * @return array
     */
    abstract public function getQueryTerms();

    /**
     * Get highlight color and shift to next
     *
     * Returns the color at $colorIndex and advances $colorIndex by one,
     * wrapping around to 0 after the last color of the list.
     *
     * @param integer &$colorIndex
     * @return string
     */
    protected function _getHighlightColor(&$colorIndex)
    {
        $color = $this->_highlightColors[$colorIndex++];

        $colorIndex %= count($this->_highlightColors);

        return $color;
    }

    /**
     * Highlight query terms
     *
     * @param integer &$colorIndex
     * @param Zend_Search_Lucene_Document_Html $doc
     */
    abstract public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex);

    /**
     * Highlight matches in $inputHTML
     *
     * The HTML is loaded into a Zend_Search_Lucene_Document_Html document,
     * highlightMatchesDOM() is run on it starting with color index 0, and the
     * document's resulting HTML is returned.
     *
     * @param string $inputHTML
     * @return string
     */
    public function highlightMatches($inputHTML)
    {
        $doc = Zend_Search_Lucene_Document_Html::loadHTML($inputHTML);

        $colorIndex = 0;
        $this->highlightMatchesDOM($doc, $colorIndex);

        return $doc->getHTML();
    }
}
BooleanExpressionRecognizer.php 0000666 00000022411 15125175533 0012757 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_FSM */
require_once 'Zend/Search/Lucene/FSM.php';
/** Zend_Search_Lucene_Search_QueryToken */
require_once 'Zend/Search/Lucene/Search/QueryToken.php';
/** Zend_Search_Lucene_Search_QueryParser */
require_once 'Zend/Search/Lucene/Search/QueryParser.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_BooleanExpressionRecognizer extends Zend_Search_Lucene_FSM
{
    /** State Machine states */
    const ST_START        = 0;
    const ST_LITERAL      = 1;
    const ST_NOT_OPERATOR = 2;
    const ST_AND_OPERATOR = 3;
    const ST_OR_OPERATOR  = 4;

    /** Input symbols */
    const IN_LITERAL      = 0;
    const IN_NOT_OPERATOR = 1;
    const IN_AND_OPERATOR = 2;
    const IN_OR_OPERATOR  = 3;

    /**
     * NOT operator signal.
     *
     * Raised by notOperatorAction() and lowered again as soon as the next
     * literal has been recorded.
     *
     * @var boolean
     */
    private $_negativeLiteral = false;

    /**
     * Literal passed to the most recent processLiteral() call
     *
     * @var mixed
     */
    private $_literal;

    /**
     * Conjunctions closed so far (the expression is a disjunction of them).
     *
     * Each conjunction is a list of two-element arrays:
     *   array(<literal>, <sign>)
     * where <sign> is false if the literal was preceded by a NOT operator
     * and true otherwise.
     *
     * Layout:
     *   array( array( array(<literal>, <sign>),   // first literal of first conjunction
     *                 array(<literal>, <sign>),   // second literal of first conjunction
     *                 ...
     *               ),                            // end of first conjunction
     *          array( array(<literal>, <sign>),   // first literal of second conjunction
     *                 ...
     *               ),                            // end of second conjunction
     *          ...
     *        )
     *
     * @var array
     */
    private $_conjunctions = array();

    /**
     * Conjunction that is currently being collected
     *
     * @var array
     */
    private $_currentConjunction = array();

    /**
     * Object constructor.
     *
     * Registers the states and input symbols, the transition rules and the
     * actions attached to them.
     */
    public function __construct()
    {
        $states = array(self::ST_START,
                        self::ST_LITERAL,
                        self::ST_NOT_OPERATOR,
                        self::ST_AND_OPERATOR,
                        self::ST_OR_OPERATOR);
        $inputSymbols = array(self::IN_LITERAL,
                              self::IN_NOT_OPERATOR,
                              self::IN_AND_OPERATOR,
                              self::IN_OR_OPERATOR);
        parent::__construct($states, $inputSymbols);

        // Actions for the two transitions where the operator was omitted
        $omittedOperator    = new Zend_Search_Lucene_FSMAction($this, 'emptyOperatorAction');
        $omittedOperatorNot = new Zend_Search_Lucene_FSMAction($this, 'emptyNotOperatorAction');

        // Rule format: array(<source state>, <input>, <target state>[, <action>])
        $rules = array(
            array(self::ST_START,        self::IN_LITERAL,      self::ST_LITERAL),
            array(self::ST_START,        self::IN_NOT_OPERATOR, self::ST_NOT_OPERATOR),

            array(self::ST_LITERAL,      self::IN_AND_OPERATOR, self::ST_AND_OPERATOR),
            array(self::ST_LITERAL,      self::IN_OR_OPERATOR,  self::ST_OR_OPERATOR),
            array(self::ST_LITERAL,      self::IN_LITERAL,      self::ST_LITERAL,      $omittedOperator),
            array(self::ST_LITERAL,      self::IN_NOT_OPERATOR, self::ST_NOT_OPERATOR, $omittedOperatorNot),

            array(self::ST_NOT_OPERATOR, self::IN_LITERAL,      self::ST_LITERAL),

            array(self::ST_AND_OPERATOR, self::IN_LITERAL,      self::ST_LITERAL),
            array(self::ST_AND_OPERATOR, self::IN_NOT_OPERATOR, self::ST_NOT_OPERATOR),

            array(self::ST_OR_OPERATOR,  self::IN_LITERAL,      self::ST_LITERAL),
            array(self::ST_OR_OPERATOR,  self::IN_NOT_OPERATOR, self::ST_NOT_OPERATOR),
        );
        $this->addRules($rules);

        $onNot     = new Zend_Search_Lucene_FSMAction($this, 'notOperatorAction');
        $onOr      = new Zend_Search_Lucene_FSMAction($this, 'orOperatorAction');
        $onLiteral = new Zend_Search_Lucene_FSMAction($this, 'literalAction');

        $this->addEntryAction(self::ST_NOT_OPERATOR, $onNot);
        $this->addEntryAction(self::ST_OR_OPERATOR,  $onOr);
        $this->addEntryAction(self::ST_LITERAL,      $onLiteral);
    }

    /**
     * Feed the next operator into the recognizer.
     *
     * Operators are identified by the class constants IN_AND_OPERATOR,
     * IN_OR_OPERATOR and IN_NOT_OPERATOR.
     *
     * @param integer $operator
     */
    public function processOperator($operator)
    {
        $this->process($operator);
    }

    /**
     * Feed the next expression literal into the recognizer.
     *
     * The literal is remembered first so that the actions triggered by the
     * IN_LITERAL input can pick it up.
     *
     * @param mixed $literal
     */
    public function processLiteral($literal)
    {
        $this->_literal = $literal;
        $this->process(self::IN_LITERAL);
    }

    /**
     * Finish the expression and return the recognized structure.
     *
     * The result is a list of conjunctions; each conjunction is a list of
     * two-element arrays:
     *   array(<literal>, <sign>)
     * where <sign> is false if the literal was preceded by a NOT operator
     * and true otherwise.
     *
     *   array( array( array(<literal>, <sign>),   // first literal of first conjunction
     *                 ...
     *               ),                            // end of first conjunction
     *          array( array(<literal>, <sign>),   // first literal of second conjunction
     *                 ...
     *               ),                            // end of second conjunction
     *          ...
     *        )
     *
     * @return array
     * @throws Zend_Search_Lucene_Exception  if the machine is not in the ST_LITERAL state
     */
    public function finishExpression()
    {
        $endedOnLiteral = ($this->getState() == self::ST_LITERAL);
        if (!$endedOnLiteral) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Literal expected.');
        }

        // Close the conjunction that is still open
        $this->_conjunctions[] = $this->_currentConjunction;

        return $this->_conjunctions;
    }

    /*********************************************************************
     * Actions implementation
     *********************************************************************/

    /**
     * Omitted operator between two literals: apply the parser's default operator
     */
    public function emptyOperatorAction()
    {
        $defaultOperator = Zend_Search_Lucene_Search_QueryParser::getDefaultOperator();
        if ($defaultOperator != Zend_Search_Lucene_Search_QueryParser::B_AND) {
            // Default operator is not AND: the literal opens a new conjunction
            $this->orOperatorAction();
        }

        // Record the literal explicitly.
        // NOTE(review): presumably needed because the ST_LITERAL entry action
        // is not run for a ST_LITERAL => ST_LITERAL transition - confirm
        // against Zend_Search_Lucene_FSM.
        $this->literalAction();
    }

    /**
     * Omitted operator followed by NOT: apply the default operator, then NOT
     */
    public function emptyNotOperatorAction()
    {
        $defaultOperator = Zend_Search_Lucene_Search_QueryParser::getDefaultOperator();
        if ($defaultOperator != Zend_Search_Lucene_Search_QueryParser::B_AND) {
            // Default operator is not AND: close the current conjunction first
            $this->orOperatorAction();
        }

        $this->notOperatorAction();
    }

    /**
     * NOT operator processing: mark the next literal as negated
     */
    public function notOperatorAction()
    {
        $this->_negativeLiteral = true;
    }

    /**
     * OR operator processing: close the current conjunction and open a new one
     */
    public function orOperatorAction()
    {
        $this->_conjunctions[]     = $this->_currentConjunction;
        $this->_currentConjunction = array();
    }

    /**
     * Literal processing: append the remembered literal to the current conjunction
     */
    public function literalAction()
    {
        // The stored flag is the literal's sign: false only when NOT preceded it
        $sign = !$this->_negativeLiteral;
        $this->_currentConjunction[] = array($this->_literal, $sign);

        // The NOT signal applies to a single literal only
        $this->_negativeLiteral = false;
    }
}
Query/Empty.php 0000666 00000007334 15125175533 0007502 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/** Zend_Search_Lucene_Search_Weight_Empty */
require_once 'Zend/Search/Lucene/Search/Weight/Empty.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_Empty extends Zend_Search_Lucene_Search_Query
{
    /**
     * Re-write query into primitive queries in the context of specified index.
     *
     * Nothing to re-write here: the query is returned as is.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    public function rewrite(Zend_Search_Lucene_Interface $index)
    {
        return $this;
    }

    /**
     * Optimize query in the context of specified index.
     *
     * The "empty" query is already primitive, so it is returned unchanged.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    public function optimize(Zend_Search_Lucene_Interface $index)
    {
        return $this;
    }

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @return Zend_Search_Lucene_Search_Weight
     */
    public function createWeight(Zend_Search_Lucene_Interface $reader)
    {
        $emptyWeight = new Zend_Search_Lucene_Search_Weight_Empty();

        return $emptyWeight;
    }

    /**
     * Execute query in context of index reader.
     *
     * Intentionally a no-op: this query keeps no internal structures.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
     */
    public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
    {
    }

    /**
     * Get document ids likely matching the query.
     *
     * Always an empty array for this query.
     *
     * @return array
     */
    public function matchedDocs()
    {
        return array();
    }

    /**
     * Score specified document.
     *
     * Always 0 for this query.
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     */
    public function score($docId, Zend_Search_Lucene_Interface $reader)
    {
        return 0;
    }

    /**
     * Return query terms.
     *
     * Always an empty array for this query.
     *
     * @return array
     */
    public function getQueryTerms()
    {
        return array();
    }

    /**
     * Highlight query terms.
     *
     * Intentionally a no-op: there are no terms to highlight and
     * $colorIndex is left untouched.
     *
     * @param Zend_Search_Lucene_Document_Html $doc
     * @param integer &$colorIndex
     */
    public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
    {
    }

    /**
     * Print a query
     *
     * @return string
     */
    public function __toString()
    {
        return '<EmptyQuery>';
    }
}
Query/MultiTerm.php 0000666 00000047532 15125175533 0010332 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/** Zend_Search_Lucene_Search_Weight_MultiTerm */
require_once 'Zend/Search/Lucene/Search/Weight/MultiTerm.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_MultiTerm extends Zend_Search_Lucene_Search_Query
{
    /**
     * Terms to find.
     * Array of Zend_Search_Lucene_Index_Term
     *
     * @var array
     */
    private $_terms = array();

    /**
     * Term signs (same keys as $_terms).
     * If true then term is required.
     * If false then term is prohibited.
     * If null then term is neither prohibited, nor required
     *
     * If the array itself is null then all terms are required
     *
     * @var array|null
     */
    private $_signs;

    /**
     * Result vector (document ids are the keys).
     *
     * @var array
     */
    private $_resVector = null;

    /**
     * Terms frequencies vectors.
     * Array of Arrays:
     * term1Id => (docId => freq, ...)
     * term2Id => (docId => freq, ...)
     *
     * @var array
     */
    private $_termsFreqs = array();

    /**
     * A score factor based on the fraction of all query terms
     * that a document contains.
     * float for conjunction queries
     * array of float (indexed by number of matched terms) for non conjunction queries
     *
     * @var mixed
     */
    private $_coord = null;

    /**
     * Terms weights
     * array of Zend_Search_Lucene_Search_Weight
     *
     * @var array
     */
    private $_weights = array();

    /**
     * Class constructor. Create a new multi-term query object.
     *
     * if $signs array is omitted then all terms are required
     * it differs from addTerm() behavior, but should never be used
     *
     * @param array $terms    Array of Zend_Search_Lucene_Index_Term objects
     * @param array $signs    Array of signs.  Sign is boolean|null.
     * @throws Zend_Search_Lucene_Exception  if the terms per query limit is exceeded
     */
    public function __construct($terms = null, $signs = null)
    {
        if (is_array($terms)) {
            // A limit of 0 is treated as "no limit", which is how the range
            // query rewrite in this package interprets the same setting.
            $termsLimit = Zend_Search_Lucene::getTermsPerQueryLimit();
            if ($termsLimit != 0 && count($terms) > $termsLimit) {
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
            }

            $this->_terms = $terms;
            $this->_signs = null;

            // Keep the signs only if at least one term is not required;
            // a null signs list is the "all terms are required" shortcut.
            if (is_array($signs)) {
                foreach ($signs as $sign) {
                    if ($sign !== true) {
                        $this->_signs = $signs;
                        break;
                    }
                }
            }
        }
    }

    /**
     * Add a $term (Zend_Search_Lucene_Index_Term) to this query.
     *
     * The sign is specified as:
     *     TRUE  - term is required
     *     FALSE - term is prohibited
     *     NULL  - term is neither prohibited, nor required
     *
     * @param  Zend_Search_Lucene_Index_Term $term
     * @param  boolean|null $sign
     * @return void
     */
    public function addTerm(Zend_Search_Lucene_Index_Term $term, $sign = null) {
        if ($sign !== true || $this->_signs !== null) { // Skip, if all terms are required
            if ($this->_signs === null) {
                // All previous terms were implicitly required: materialize
                // their signs under the same keys the terms are stored with,
                // so that sign lookups by term id stay aligned.
                $this->_signs = array();
                foreach ($this->_terms as $prevTermId => $prevTerm) {
                    $this->_signs[$prevTermId] = true;
                }
            }
            $this->_signs[] = $sign;
        }

        $this->_terms[] = $term;
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * Returns an empty query if there are no terms, the query itself if every
     * term has a field, and a boolean query of re-written term queries otherwise.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    public function rewrite(Zend_Search_Lucene_Interface $index)
    {
        if (count($this->_terms) == 0) {
            return new Zend_Search_Lucene_Search_Query_Empty();
        }

        // Check, that all fields are qualified
        $allQualified = true;
        foreach ($this->_terms as $term) {
            if ($term->field === null) {
                $allQualified = false;
                break;
            }
        }

        if ($allQualified) {
            return $this;
        } else {
            /** transform multiterm query to boolean and apply rewrite() method to subqueries. */
            $query = new Zend_Search_Lucene_Search_Query_Boolean();
            $query->setBoost($this->getBoost());

            foreach ($this->_terms as $termId => $term) {
                $subquery = new Zend_Search_Lucene_Search_Query_Term($term);

                $query->addSubquery($subquery->rewrite($index),
                                    ($this->_signs === null)? true : $this->_signs[$termId]);
            }

            return $query;
        }
    }

    /**
     * Optimize query in the context of specified index
     *
     * Terms missing from the index are dropped (or make the whole query empty
     * if they are required); a single remaining term becomes a term query.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    public function optimize(Zend_Search_Lucene_Interface $index)
    {
        $terms = $this->_terms;
        $signs = $this->_signs;

        foreach ($terms as $id => $term) {
            if (!$index->hasTerm($term)) {
                if ($signs === null  ||  $signs[$id] === true) {
                    // Term is required
                    return new Zend_Search_Lucene_Search_Query_Empty();
                } else {
                    // Term is optional or prohibited
                    // Remove it from terms and signs list
                    unset($terms[$id]);
                    unset($signs[$id]);
                }
            }
        }

        // Check if all presented terms are prohibited
        $allProhibited = true;
        if ($signs === null) {
            $allProhibited = false;
        } else {
            foreach ($signs as $sign) {
                if ($sign !== false) {
                    $allProhibited = false;
                    break;
                }
            }
        }
        if ($allProhibited) {
            return new Zend_Search_Lucene_Search_Query_Empty();
        }

        /**
         * @todo make an optimization for repeated terms
         * (they may have different signs)
         */

        if (count($terms) == 1) {
            // It's already checked, that it's not a prohibited term

            // It's one term query with one required or optional element
            $optimizedQuery = new Zend_Search_Lucene_Search_Query_Term(reset($terms));
            $optimizedQuery->setBoost($this->getBoost());

            return $optimizedQuery;
        }

        if (count($terms) == 0) {
            return new Zend_Search_Lucene_Search_Query_Empty();
        }

        $optimizedQuery = new Zend_Search_Lucene_Search_Query_MultiTerm($terms, $signs);
        $optimizedQuery->setBoost($this->getBoost());
        return $optimizedQuery;
    }

    /**
     * Returns query terms
     *
     * @return array
     */
    public function getTerms()
    {
        return $this->_terms;
    }

    /**
     * Return terms signs
     *
     * @return array|null  null means that all terms are required
     */
    public function getSigns()
    {
        return $this->_signs;
    }

    /**
     * Set weight for specified term
     *
     * @param integer $num
     * @param Zend_Search_Lucene_Search_Weight_Term $weight
     */
    public function setWeight($num, $weight)
    {
        $this->_weights[$num] = $weight;
    }

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @return Zend_Search_Lucene_Search_Weight
     */
    public function createWeight(Zend_Search_Lucene_Interface $reader)
    {
        $this->_weight = new Zend_Search_Lucene_Search_Weight_MultiTerm($this, $reader);
        return $this->_weight;
    }

    /**
     * Calculate result vector for Conjunction query
     * (like '+something +another')
     *
     * @param Zend_Search_Lucene_Interface $reader
     */
    private function _calculateConjunctionResult(Zend_Search_Lucene_Interface $reader)
    {
        $this->_resVector = null;

        if (count($this->_terms) == 0) {
            // Nothing to intersect. Stop here: the code below expects at
            // least one term docs vector to build the result from.
            $this->_resVector = array();
            return;
        }

        // Order terms by selectivity
        $docFreqs = array();
        $ids      = array();
        foreach ($this->_terms as $id => $term) {
            $docFreqs[] = $reader->docFreq($term);
            $ids[]      = $id; // Used to keep original order for terms with the same selectivity and omit terms comparison
        }
        array_multisort($docFreqs, SORT_ASC, SORT_NUMERIC,
                        $ids,      SORT_ASC, SORT_NUMERIC,
                        $this->_terms);

        $docsFilter = new Zend_Search_Lucene_Index_DocsFilter();
        foreach ($this->_terms as $termId => $term) {
            $termDocs = $reader->termDocs($term, $docsFilter);
        }
        // Treat last retrieved docs vector as a result set
        // (filter collects data for other terms)
        $this->_resVector = array_flip($termDocs);

        foreach ($this->_terms as $termId => $term) {
            $this->_termsFreqs[$termId] = $reader->termFreqs($term, $docsFilter);
        }

        // ksort($this->_resVector, SORT_NUMERIC);
        // Docs are returned ordered. Used algorithms doesn't change elements order.
    }

    /**
     * Calculate result vector for non Conjunction query
     * (like '+something -another')
     *
     * @param Zend_Search_Lucene_Interface $reader
     */
    private function _calculateNonConjunctionResult(Zend_Search_Lucene_Interface $reader)
    {
        $requiredVectors      = array();
        $requiredVectorsSizes = array();
        $requiredVectorsIds   = array(); // is used to prevent arrays comparison

        $optional   = array();
        $prohibited = array();

        foreach ($this->_terms as $termId => $term) {
            // Document ids become the keys
            $termDocs = array_flip($reader->termDocs($term));

            if ($this->_signs[$termId] === true) {
                // required
                $requiredVectors[]      = $termDocs;
                $requiredVectorsSizes[] = count($termDocs);
                $requiredVectorsIds[]   = $termId;
            } elseif ($this->_signs[$termId] === false) {
                // prohibited
                // array union
                $prohibited += $termDocs;
            } else {
                // neither required, nor prohibited
                // array union
                $optional += $termDocs;
            }

            $this->_termsFreqs[$termId] = $reader->termFreqs($term);
        }

        // sort resvectors in order of subquery cardinality increasing
        array_multisort($requiredVectorsSizes, SORT_ASC, SORT_NUMERIC,
                        $requiredVectorsIds,   SORT_ASC, SORT_NUMERIC,
                        $requiredVectors);

        // Intersect the required vectors; stays null if there are none
        $required = null;
        foreach ($requiredVectors as $nextResVector) {
            if ($required === null) {
                $required = $nextResVector;
            } else {
                //$required = array_intersect_key($required, $nextResVector);

                /**
                 * This code is used as workaround for array_intersect_key() slowness problem.
                 */
                $updatedVector = array();
                foreach ($required as $id => $value) {
                    if (isset($nextResVector[$id])) {
                        $updatedVector[$id] = $value;
                    }
                }
                $required = $updatedVector;
            }

            if (count($required) == 0) {
                // Empty result set, we don't need to check other terms
                break;
            }
        }

        // Optional terms define the result only when nothing is required
        if ($required !== null) {
            $this->_resVector = $required;
        } else {
            $this->_resVector = $optional;
        }

        if (count($prohibited) != 0) {
            // $this->_resVector = array_diff_key($this->_resVector, $prohibited);

            /**
             * This code is used as workaround for array_diff_key() slowness problem.
             */
            if (count($this->_resVector) < count($prohibited)) {
                // Fewer results than prohibited docs: walk the results
                $updatedVector = $this->_resVector;
                foreach ($this->_resVector as $id => $value) {
                    if (isset($prohibited[$id])) {
                        unset($updatedVector[$id]);
                    }
                }
                $this->_resVector = $updatedVector;
            } else {
                // Otherwise walk the prohibited docs
                $updatedVector = $this->_resVector;
                foreach ($prohibited as $id => $value) {
                    unset($updatedVector[$id]);
                }
                $this->_resVector = $updatedVector;
            }
        }

        ksort($this->_resVector, SORT_NUMERIC);
    }

    /**
     * Score calculator for conjunction queries (all terms are required)
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     */
    public function _conjunctionScore($docId, Zend_Search_Lucene_Interface $reader)
    {
        if ($this->_coord === null) {
            // Every term matches in a conjunction, so one coord value is enough
            $this->_coord = $reader->getSimilarity()->coord(count($this->_terms),
                                                            count($this->_terms) );
        }

        $score = 0.0;

        foreach ($this->_terms as $termId => $term) {
            /**
             * We don't need to check that term freq is not 0
             * Score calculation is performed only for matched docs
             */
            $score += $reader->getSimilarity()->tf($this->_termsFreqs[$termId][$docId]) *
                      $this->_weights[$termId]->getValue() *
                      $reader->norm($docId, $term->field);
        }

        return $score * $this->_coord * $this->getBoost();
    }

    /**
     * Score calculator for non conjunction queries (not all terms are required)
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     */
    public function _nonConjunctionScore($docId, $reader)
    {
        if ($this->_coord === null) {
            // Precalculate coord for every possible number of matched terms
            $this->_coord = array();

            $maxCoord = 0;
            foreach ($this->_signs as $sign) {
                if ($sign !== false /* not prohibited */) {
                    $maxCoord++;
                }
            }

            for ($count = 0; $count <= $maxCoord; $count++) {
                $this->_coord[$count] = $reader->getSimilarity()->coord($count, $maxCoord);
            }
        }

        $score = 0.0;
        $matchedTerms = 0;
        foreach ($this->_terms as $termId => $term) {
            // Check if term is
            if ($this->_signs[$termId] !== false &&        // not prohibited
                isset($this->_termsFreqs[$termId][$docId]) // matched
               ) {
                $matchedTerms++;

                /**
                 * We don't need to check that term freq is not 0
                 * Score calculation is performed only for matched docs
                 */
                $score +=
                      $reader->getSimilarity()->tf($this->_termsFreqs[$termId][$docId]) *
                      $this->_weights[$termId]->getValue() *
                      $reader->norm($docId, $term->field);
            }
        }

        return $score * $this->_coord[$matchedTerms] * $this->getBoost();
    }

    /**
     * Execute query in context of index reader
     * It also initializes necessary internal structures
     *
     * NOTE(review): $docsFilter is accepted but not used by this implementation.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
     */
    public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
    {
        if ($this->_signs === null) {
            $this->_calculateConjunctionResult($reader);
        } else {
            $this->_calculateNonConjunctionResult($reader);
        }

        // Initialize weight if it's not done yet
        $this->_initWeight($reader);
    }

    /**
     * Get document ids likely matching the query
     *
     * It's an array with document ids as keys (performance considerations)
     *
     * @return array
     */
    public function matchedDocs()
    {
        return $this->_resVector;
    }

    /**
     * Score specified document
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     */
    public function score($docId, Zend_Search_Lucene_Interface $reader)
    {
        if (isset($this->_resVector[$docId])) {
            if ($this->_signs === null) {
                return $this->_conjunctionScore($docId, $reader);
            } else {
                return $this->_nonConjunctionScore($docId, $reader);
            }
        } else {
            return 0;
        }
    }

    /**
     * Return query terms (prohibited terms are left out)
     *
     * @return array
     */
    public function getQueryTerms()
    {
        if ($this->_signs === null) {
            return $this->_terms;
        }

        $terms = array();

        foreach ($this->_signs as $id => $sign) {
            if ($sign !== false) {
                $terms[] = $this->_terms[$id];
            }
        }

        return $terms;
    }

    /**
     * Highlight query terms (prohibited terms are left out)
     *
     * @param Zend_Search_Lucene_Document_Html $doc
     * @param integer &$colorIndex
     */
    public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
    {
        $words = array();

        if ($this->_signs === null) {
            foreach ($this->_terms as $term) {
                $words[] = $term->text;
            }
        } else {
            foreach ($this->_signs as $id => $sign) {
                if ($sign !== false) {
                    $words[] = $this->_terms[$id]->text;
                }
            }
        }

        $doc->highlight($words, $this->_getHighlightColor($colorIndex));
    }

    /**
     * Print a query
     *
     * @return string
     */
    public function __toString()
    {
        // It's used only for query visualisation, so we don't care about characters escaping

        // Terms are collected and joined afterwards, so the separator does
        // not depend on the keys the terms are stored under.
        $parts = array();

        foreach ($this->_terms as $id => $term) {
            $part = '';

            if ($this->_signs === null || $this->_signs[$id] === true) {
                $part .= '+';
            } else if ($this->_signs[$id] === false) {
                $part .= '-';
            }

            if ($term->field !== null) {
                $part .= $term->field . ':';
            }

            $parts[] = $part . $term->text;
        }

        $query = implode(' ', $parts);

        if ($this->getBoost() != 1) {
            $query = '(' . $query . ')^' . $this->getBoost();
        }

        return $query;
    }
}
Query/Range.php 0000666 00000024544 15125175533 0007442 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/** Zend_Search_Lucene_Search_Query_MultiTerm */
require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_Range extends Zend_Search_Lucene_Search_Query
{
    /**
     * Lower term.
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_lowerTerm;

    /**
     * Upper term.
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_upperTerm;

    /**
     * Search field
     *
     * @var string
     */
    private $_field;

    /**
     * Inclusive
     *
     * @var boolean
     */
    private $_inclusive;

    /**
     * Matched terms.
     *
     * Matched terms list.
     * It's filled during the search (rewrite operation) and may be used for search result
     * post-processing
     *
     * Array of Zend_Search_Lucene_Index_Term objects, null until rewrite() is called
     *
     * @var array
     */
    private $_matches;

    /**
     * Zend_Search_Lucene_Search_Query_Range constructor.
     *
     * @param Zend_Search_Lucene_Index_Term|null $lowerTerm
     * @param Zend_Search_Lucene_Index_Term|null $upperTerm
     * @param boolean $inclusive
     * @throws Zend_Search_Lucene_Exception  if both terms are null or their fields differ
     */
    public function __construct($lowerTerm, $upperTerm, $inclusive)
    {
        if ($lowerTerm === null  &&  $upperTerm === null) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('At least one term must be non-null');
        }
        if ($lowerTerm !== null  &&  $upperTerm !== null  &&  $lowerTerm->field != $upperTerm->field) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Both terms must be for the same field');
        }

        $this->_field     = ($lowerTerm !== null)? $lowerTerm->field : $upperTerm->field;
        $this->_lowerTerm = $lowerTerm;
        $this->_upperTerm = $upperTerm;
        $this->_inclusive = $inclusive;
    }

    /**
     * Get query field name
     *
     * @return string|null
     */
    public function getField()
    {
        return $this->_field;
    }

    /**
     * Get lower term
     *
     * @return Zend_Search_Lucene_Index_Term|null
     */
    public function getLowerTerm()
    {
        return $this->_lowerTerm;
    }

    /**
     * Get upper term
     *
     * @return Zend_Search_Lucene_Index_Term|null
     */
    public function getUpperTerm()
    {
        return $this->_upperTerm;
    }

    /**
     * Check whether the range boundaries are included into the range
     *
     * @return boolean
     */
    public function isInclusive()
    {
        return $this->_inclusive;
    }

    /**
     * Append a term to the matched terms list and enforce the terms limit
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @param integer $maxTerms  Terms per query limit, 0 means no limit
     * @throws Zend_Search_Lucene_Exception  if the limit is exceeded
     */
    private function _addMatch($term, $maxTerms)
    {
        $this->_matches[] = $term;

        if ($maxTerms != 0  &&  count($this->_matches) > $maxTerms) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
        }
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * Collects the index terms that fall into the range and returns an empty,
     * a term or a multi-term query built from them. The boost of this query
     * is carried over to the returned term / multi-term query.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Exception  if the terms per query limit is exceeded
     */
    public function rewrite(Zend_Search_Lucene_Interface $index)
    {
        $this->_matches = array();

        if ($this->_field === null) {
            // Search through all fields
            $fields = $index->getFieldNames(true /* indexed fields list */);
        } else {
            $fields = array($this->_field);
        }

        $maxTerms = Zend_Search_Lucene::getTermsPerQueryLimit();
        foreach ($fields as $field) {
            $index->resetTermsStream();

            if ($this->_lowerTerm !== null) {
                $lowerTerm = new Zend_Search_Lucene_Index_Term($this->_lowerTerm->text, $field);

                $index->skipTo($lowerTerm);

                if (!$this->_inclusive  &&
                    $index->currentTerm() == $lowerTerm) {
                    // Skip lower term
                    $index->nextTerm();
                }
            } else {
                $index->skipTo(new Zend_Search_Lucene_Index_Term('', $field));
            }

            if ($this->_upperTerm !== null) {
                // Walk up to the upper term
                $upperTerm = new Zend_Search_Lucene_Index_Term($this->_upperTerm->text, $field);

                while ($index->currentTerm() !== null          &&
                       $index->currentTerm()->field == $field  &&
                       $index->currentTerm()->text  <  $upperTerm->text) {
                    $this->_addMatch($index->currentTerm(), $maxTerms);

                    $index->nextTerm();
                }

                if ($this->_inclusive  &&  $index->currentTerm() == $upperTerm) {
                    // Include upper term into result (it counts towards the limit too)
                    $this->_addMatch($upperTerm, $maxTerms);
                }
            } else {
                // Walk up to the end of field data
                while ($index->currentTerm() !== null  &&  $index->currentTerm()->field == $field) {
                    $this->_addMatch($index->currentTerm(), $maxTerms);

                    $index->nextTerm();
                }
            }

            $index->closeTermsStream();
        }

        if (count($this->_matches) == 0) {
            return new Zend_Search_Lucene_Search_Query_Empty();
        } else if (count($this->_matches) == 1) {
            $rewrittenQuery = new Zend_Search_Lucene_Search_Query_Term(reset($this->_matches));
            // The rewritten query replaces this one, so keep its boost
            $rewrittenQuery->setBoost($this->getBoost());

            return $rewrittenQuery;
        } else {
            $rewrittenQuery = new Zend_Search_Lucene_Search_Query_MultiTerm();
            // The rewritten query replaces this one, so keep its boost
            $rewrittenQuery->setBoost($this->getBoost());

            foreach ($this->_matches as $matchedTerm) {
                $rewrittenQuery->addTerm($matchedTerm);
            }

            return $rewrittenQuery;
        }
    }

    /**
     * Optimize query in the context of specified index
     *
     * Not supported: a range query has to be re-written first.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Exception
     */
    public function optimize(Zend_Search_Lucene_Interface $index)
    {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Range query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Return query terms
     *
     * @return array
     * @throws Zend_Search_Lucene_Exception  if rewrite() has not been performed yet
     */
    public function getQueryTerms()
    {
        if ($this->_matches === null) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Search has to be performed first to get matched terms');
        }

        return $this->_matches;
    }

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * Not supported: a range query has to be re-written first.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @return Zend_Search_Lucene_Search_Weight
     * @throws Zend_Search_Lucene_Exception
     */
    public function createWeight(Zend_Search_Lucene_Interface $reader)
    {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Range query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Execute query in context of index reader
     * It also initializes necessary internal structures
     *
     * Not supported: a range query has to be re-written first.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
     * @throws Zend_Search_Lucene_Exception
     */
    public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
    {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Range query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Get document ids likely matching the query
     *
     * Not supported: a range query has to be re-written first.
     *
     * @return array
     * @throws Zend_Search_Lucene_Exception
     */
    public function matchedDocs()
    {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Range query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Score specified document
     *
     * Not supported: a range query has to be re-written first.
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     * @throws Zend_Search_Lucene_Exception
     */
    public function score($docId, Zend_Search_Lucene_Interface $reader)
    {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Range query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Highlight query terms
     *
     * Uses the terms matched by the last rewrite(); if rewrite() has not
     * been performed yet, no words are passed for highlighting.
     *
     * @param Zend_Search_Lucene_Document_Html $doc
     * @param integer &$colorIndex
     */
    public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
    {
        $words = array();

        // $_matches stays null until rewrite() is called
        if ($this->_matches !== null) {
            foreach ($this->_matches as $term) {
                $words[] = $term->text;
            }
        }

        $doc->highlight($words, $this->_getHighlightColor($colorIndex));
    }

    /**
     * Print a query
     *
     * @return string
     */
    public function __toString()
    {
        // It's used only for query visualisation, so we don't care about characters escaping
        return (($this->_field === null)? '' : $this->_field . ':')
             . (($this->_inclusive)? '[' : '{')
             . (($this->_lowerTerm !== null)?  $this->_lowerTerm->text : 'null')
             . ' TO '
             . (($this->_upperTerm !== null)?  $this->_upperTerm->text : 'null')
             . (($this->_inclusive)? ']' : '}');
    }
}
Query/Term.php 0000666 00000014063 15125175533 0007310 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/** Zend_Search_Lucene_Search_Weight_Term */
require_once 'Zend/Search/Lucene/Search/Weight/Term.php';
/**
 * Query matching documents that contain one single term.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Search_Query_Term extends Zend_Search_Lucene_Search_Query
{
    /**
     * Term to find.
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_term;

    /**
     * Documents vector.
     * Flipped result of termDocs(), so document ids are the keys.
     * Stays null until execute() has been called.
     *
     * @var array|null
     */
    private $_docVector = null;

    /**
     * Term freqs vector.
     * array(docId => freq, ...)
     *
     * @var array
     */
    private $_termFreqs;

    /**
     * Zend_Search_Lucene_Search_Query_Term constructor
     *
     * @param Zend_Search_Lucene_Index_Term $term Term to search for.
     */
    public function __construct(Zend_Search_Lucene_Index_Term $term)
    {
        $this->_term = $term;
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * A term that carries a field is already primitive and is returned as is.
     * Otherwise the term text is searched in every field reported by
     * $index->getFieldNames(true) through a multi-term query that inherits
     * this query's boost.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    public function rewrite(Zend_Search_Lucene_Interface $index)
    {
        // Loose comparison: both null and '' count as "no field given".
        if ($this->_term->field != null) {
            return $this;
        }

        $multiTermQuery = new Zend_Search_Lucene_Search_Query_MultiTerm();
        $multiTermQuery->setBoost($this->getBoost());

        foreach ($index->getFieldNames(true) as $fieldName) {
            $multiTermQuery->addTerm(new Zend_Search_Lucene_Index_Term($this->_term->text, $fieldName));
        }

        return $multiTermQuery->rewrite($index);
    }

    /**
     * Optimize query in the context of specified index
     *
     * Returns an empty query when the index does not contain the term,
     * otherwise the query itself.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    public function optimize(Zend_Search_Lucene_Interface $index)
    {
        if ($index->hasTerm($this->_term)) {
            return $this;
        }

        return new Zend_Search_Lucene_Search_Query_Empty();
    }

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * The created weight is also stored in $this->_weight.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @return Zend_Search_Lucene_Search_Weight
     */
    public function createWeight(Zend_Search_Lucene_Interface $reader)
    {
        $this->_weight = new Zend_Search_Lucene_Search_Weight_Term($this->_term, $this, $reader);

        return $this->_weight;
    }

    /**
     * Execute query in context of index reader
     * It also initializes necessary internal structures
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
     */
    public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
    {
        // Flip so that document ids become keys; score() relies on isset() lookups.
        $this->_docVector = array_flip($reader->termDocs($this->_term, $docsFilter));
        $this->_termFreqs = $reader->termFreqs($this->_term, $docsFilter);

        // Initialize weight if it's not done yet
        $this->_initWeight($reader);
    }

    /**
     * Get document ids likely matching the query
     *
     * It's an array with document ids as keys (performance considerations)
     *
     * @return array
     */
    public function matchedDocs()
    {
        return $this->_docVector;
    }

    /**
     * Score specified document
     *
     * The score is tf(term frequency) * weight value * field norm * boost.
     * Documents that are not part of the documents vector score 0.
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     */
    public function score($docId, Zend_Search_Lucene_Interface $reader)
    {
        if (!isset($this->_docVector[$docId])) {
            return 0;
        }

        return $reader->getSimilarity()->tf($this->_termFreqs[$docId]) *
               $this->_weight->getValue() *
               $reader->norm($docId, $this->_term->field) *
               $this->getBoost();
    }

    /**
     * Return query terms
     *
     * @return array One-element array holding the query term.
     */
    public function getQueryTerms()
    {
        return array($this->_term);
    }

    /**
     * Return query term
     *
     * @return Zend_Search_Lucene_Index_Term
     */
    public function getTerm()
    {
        return $this->_term;
    }

    /**
     * Returns query terms
     *
     * This class stores a single term in $_term (there is no $_terms
     * property), so the term is wrapped into an array to honour the
     * documented return type.
     *
     * @return array
     */
    public function getTerms()
    {
        return array($this->_term);
    }

    /**
     * Highlight query terms
     *
     * @param Zend_Search_Lucene_Document_Html $doc
     * @param integer &$colorIndex
     */
    public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
    {
        $doc->highlight($this->_term->text, $this->_getHighlightColor($colorIndex));
    }

    /**
     * Print a query
     *
     * @return string "field:text", or just "text" when the term has no field.
     */
    public function __toString()
    {
        // It's used only for query visualisation, so we don't care about characters escaping
        $fieldPrefix = ($this->_term->field === null) ? '' : $this->_term->field . ':';

        return $fieldPrefix . $this->_term->text;
    }
}
Query/Phrase.php 0000666 00000043011 15125175533 0007616 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* Zend_Search_Lucene_Search_Query
*/
require_once 'Zend/Search/Lucene/Search/Query.php';
/**
* Zend_Search_Lucene_Search_Weight_Phrase
*/
require_once 'Zend/Search/Lucene/Search/Weight/Phrase.php';
/**
 * A Query that matches documents containing a particular sequence of terms.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Search_Query_Phrase extends Zend_Search_Lucene_Search_Query
{
    /**
     * Terms to find.
     * Array of Zend_Search_Lucene_Index_Term objects.
     *
     * Keys are whatever the constructor caller supplied (addTerm() appends),
     * so the first term must be fetched with reset(), not with index 0.
     *
     * @var array
     */
    private $_terms;

    /**
     * Term positions (relative positions of terms within the phrase).
     * Array of integers, keyed like $_terms.
     *
     * @var array
     */
    private $_offsets;

    /**
     * Sets the number of other words permitted between words in query phrase.
     * If zero, then this is an exact phrase search.  For larger values this works
     * like a WITHIN or NEAR operator.
     *
     * The slop is in fact an edit-distance, where the units correspond to
     * moves of terms in the query phrase out of position.  For example, to switch
     * the order of two words requires two moves (the first move places the words
     * atop one another), so to permit re-orderings of phrases, the slop must be
     * at least two.
     * More exact matches are scored higher than sloppier matches, thus search
     * results are sorted by exactness.
     *
     * The slop is zero by default, requiring exact matches.
     *
     * @var integer
     */
    private $_slop;

    /**
     * Result vector (document ids as keys). Null until execute() has run.
     *
     * @var array|null
     */
    private $_resVector = null;

    /**
     * Terms positions vectors.
     * Array of Arrays:
     * term1Id => (docId => array( pos1, pos2, ... ), ...)
     * term2Id => (docId => array( pos1, pos2, ... ), ...)
     *
     * @var array
     */
    private $_termsPositions = array();

    /**
     * Class constructor.  Create a new phrase query.
     *
     * @param array|null  $terms   Terms to search. Array of strings; keys are preserved.
     * @param array|null  $offsets Relative term positions. Array of integers.
     *                             When null, positions 0, 1, 2, ... are assigned in term order.
     * @param string|null $field   Field to search. When null, terms are created without
     *                             an explicit field argument.
     * @throws Zend_Search_Lucene_Exception
     */
    public function __construct($terms = null, $offsets = null, $field = null)
    {
        $this->_slop  = 0;
        $this->_terms = array();

        if (is_array($terms)) {
            foreach ($terms as $termId => $termText) {
                $this->_terms[$termId] = ($field !== null)
                    ? new Zend_Search_Lucene_Index_Term($termText, $field)
                    : new Zend_Search_Lucene_Index_Term($termText);
            }
        } else if ($terms !== null) {
            throw new Zend_Search_Lucene_Exception('terms argument must be array of strings or null');
        }

        if (is_array($offsets)) {
            if (count($this->_terms) != count($offsets)) {
                throw new Zend_Search_Lucene_Exception('terms and offsets arguments must have the same size.');
            }
            $this->_offsets = $offsets;
        } else if ($offsets === null) {
            // Consecutive positions, keyed exactly like the terms.
            $this->_offsets = array();
            $position       = 0;
            foreach (array_keys($this->_terms) as $termId) {
                $this->_offsets[$termId] = $position++;
            }
        } else {
            throw new Zend_Search_Lucene_Exception('offsets argument must be array of integers or null');
        }
    }

    /**
     * Set slop
     *
     * @param integer $slop
     */
    public function setSlop($slop)
    {
        $this->_slop = $slop;
    }

    /**
     * Get slop
     *
     * @return integer
     */
    public function getSlop()
    {
        return $this->_slop;
    }

    /**
     * Adds a term to the end of the query phrase.
     * The relative position of the term is specified explicitly or the one immediately
     * after the last term added.
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @param integer $position
     * @throws Zend_Search_Lucene_Exception When the term's field differs from the field
     *                                      of the last term already in the phrase.
     */
    public function addTerm(Zend_Search_Lucene_Index_Term $term, $position = null)
    {
        if ((count($this->_terms) != 0) && (end($this->_terms)->field != $term->field)) {
            throw new Zend_Search_Lucene_Exception('All phrase terms must be in the same field: ' .
                                                   $term->field . ':' . $term->text);
        }

        $this->_terms[] = $term;

        if ($position !== null) {
            $this->_offsets[] = $position;
        } else if (count($this->_offsets) != 0) {
            $this->_offsets[] = end($this->_offsets) + 1;
        } else {
            $this->_offsets[] = 0;
        }
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * An empty phrase becomes an empty query; a phrase whose first term has a
     * field is returned unchanged. Otherwise one phrase subquery per field from
     * $index->getFieldNames(true) is collected into a boolean query that
     * inherits this query's boost.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    public function rewrite(Zend_Search_Lucene_Interface $index)
    {
        if (count($this->_terms) == 0) {
            return new Zend_Search_Lucene_Search_Query_Empty();
        }

        // reset() instead of [0]: constructor-supplied keys need not start at 0.
        $firstTerm = reset($this->_terms);
        if ($firstTerm->field !== null) {
            return $this;
        }

        $query = new Zend_Search_Lucene_Search_Query_Boolean();
        $query->setBoost($this->getBoost());

        foreach ($index->getFieldNames(true) as $fieldName) {
            $subquery = new Zend_Search_Lucene_Search_Query_Phrase();
            $subquery->setSlop($this->getSlop());

            foreach ($this->_terms as $termId => $term) {
                $qualifiedTerm = new Zend_Search_Lucene_Index_Term($term->text, $fieldName);
                $subquery->addTerm($qualifiedTerm, $this->_offsets[$termId]);
            }

            $query->addSubquery($subquery);
        }

        return $query;
    }

    /**
     * Optimize query in the context of specified index
     *
     * Returns an empty query when the phrase has no terms or the index lacks
     * one of them, a term query (with this query's boost) for a one-term
     * phrase, and the phrase itself otherwise.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    public function optimize(Zend_Search_Lucene_Interface $index)
    {
        // Check, that index contains all phrase terms
        foreach ($this->_terms as $term) {
            if (!$index->hasTerm($term)) {
                return new Zend_Search_Lucene_Search_Query_Empty();
            }
        }

        $termCount = count($this->_terms);

        if ($termCount == 1) {
            // It's one term query
            $optimizedQuery = new Zend_Search_Lucene_Search_Query_Term(reset($this->_terms));
            $optimizedQuery->setBoost($this->getBoost());

            return $optimizedQuery;
        }

        if ($termCount == 0) {
            return new Zend_Search_Lucene_Search_Query_Empty();
        }

        return $this;
    }

    /**
     * Returns query terms
     *
     * @return array
     */
    public function getTerms()
    {
        return $this->_terms;
    }

    /**
     * Set weight for specified term
     *
     * NOTE(review): $_weights is not declared in this class, so unless the
     * parent class declares it this creates a dynamic property. Nothing in
     * this class reads it back - confirm whether the method is still needed.
     *
     * @param integer $num
     * @param Zend_Search_Lucene_Search_Weight_Term $weight
     */
    public function setWeight($num, $weight)
    {
        $this->_weights[$num] = $weight;
    }

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * The created weight is also stored in $this->_weight.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @return Zend_Search_Lucene_Search_Weight
     */
    public function createWeight(Zend_Search_Lucene_Interface $reader)
    {
        $this->_weight = new Zend_Search_Lucene_Search_Weight_Phrase($this, $reader);

        return $this->_weight;
    }

    /**
     * Score calculator for exact phrase queries (terms sequence is fixed)
     *
     * Counts the positions of the rarest term (within the document) at which
     * every other term is found at its expected relative offset.
     *
     * @param integer $docId
     * @return float
     */
    public function _exactPhraseFreq($docId)
    {
        $freq = 0;

        // Find the term with the fewest positions in this document
        $lowCardTermId = null;
        foreach ($this->_terms as $termId => $term) {
            if ($lowCardTermId === null ||
                count($this->_termsPositions[$termId][$docId]) <
                count($this->_termsPositions[$lowCardTermId][$docId])) {
                $lowCardTermId = $termId;
            }
        }

        // Walk through positions of the term with lowest cardinality
        foreach ($this->_termsPositions[$lowCardTermId][$docId] as $lowCardPos) {
            // We expect phrase to be found
            $freq++;

            // Walk through other terms
            foreach ($this->_terms as $termId => $term) {
                if ($termId == $lowCardTermId) {
                    continue;
                }

                $expectedPosition = $lowCardPos +
                                    ($this->_offsets[$termId] -
                                     $this->_offsets[$lowCardTermId]);

                if (!in_array($expectedPosition, $this->_termsPositions[$termId][$docId])) {
                    $freq--;  // Phrase wasn't found.
                    break;
                }
            }
        }

        return $freq;
    }

    /**
     * Score calculator for sloppy phrase queries (terms sequence is fixed)
     *
     * Builds candidate phrases (one position per term), measures for each how
     * far it is from the ideal term layout and adds sloppyFreq(distance) for
     * every candidate whose minimal distance does not exceed the slop.
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     */
    public function _sloppyPhraseFreq($docId, Zend_Search_Lucene_Interface $reader)
    {
        $freq = 0;

        $phraseQueue    = array();
        $phraseQueue[0] = array(); // empty phrase
        $lastTerm       = null;

        // Walk through the terms to create phrases.
        foreach ($this->_terms as $termId => $term) {
            // Only candidates that existed before this term are extended.
            $queueSize = count($phraseQueue);
            $firstPass = true;

            // Walk through the term positions.
            // Each term position produces a set of phrases.
            foreach ($this->_termsPositions[$termId][$docId] as $termPosition) {
                if ($firstPass) {
                    // First position: extend the existing candidates in place.
                    for ($count = 0; $count < $queueSize; $count++) {
                        $phraseQueue[$count][$termId] = $termPosition;
                    }
                } else {
                    // Further positions: fork each existing candidate, unless the
                    // gap to the previous term already exceeds the slop.
                    for ($count = 0; $count < $queueSize; $count++) {
                        if ($lastTerm !== null &&
                            abs($termPosition - $phraseQueue[$count][$lastTerm] -
                                ($this->_offsets[$termId] - $this->_offsets[$lastTerm])) > $this->_slop) {
                            continue;
                        }

                        $newPhraseId = count($phraseQueue);
                        $phraseQueue[$newPhraseId]          = $phraseQueue[$count];
                        $phraseQueue[$newPhraseId][$termId] = $termPosition;
                    }
                }

                $firstPass = false;
            }

            $lastTerm = $termId;
        }

        foreach ($phraseQueue as $phrasePos) {
            $minDistance = null;

            // Try every phrase start within +/- slop of the first term's position.
            for ($shift = -$this->_slop; $shift <= $this->_slop; $shift++) {
                $distance = 0;
                $start    = reset($phrasePos) - reset($this->_offsets) + $shift;

                foreach ($this->_terms as $termId => $term) {
                    $distance += abs($phrasePos[$termId] - $this->_offsets[$termId] - $start);

                    if ($distance > $this->_slop) {
                        break;
                    }
                }

                if ($minDistance === null || $distance < $minDistance) {
                    $minDistance = $distance;
                }
            }

            if ($minDistance <= $this->_slop) {
                $freq += $reader->getSimilarity()->sloppyFreq($minDistance);
            }
        }

        return $freq;
    }

    /**
     * Execute query in context of index reader
     * It also initializes necessary internal structures
     *
     * Intersects the document sets of all phrase terms and loads the term
     * positions used later by score(). $docsFilter is accepted but not used
     * by this implementation.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
     */
    public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
    {
        $this->_resVector = null;

        if (count($this->_terms) == 0) {
            $this->_resVector = array();
        }

        $resVectors      = array();
        $resVectorsSizes = array();
        $resVectorsIds   = array(); // is used to prevent arrays comparison
        foreach ($this->_terms as $termId => $term) {
            $resVectors[]      = array_flip($reader->termDocs($term));
            $resVectorsSizes[] = count(end($resVectors));
            $resVectorsIds[]   = $termId;

            $this->_termsPositions[$termId] = $reader->termPositions($term);
        }

        // sort resvectors in order of subquery cardinality increasing
        array_multisort($resVectorsSizes, SORT_ASC, SORT_NUMERIC,
                        $resVectorsIds,   SORT_ASC, SORT_NUMERIC,
                        $resVectors);

        foreach ($resVectors as $nextResVector) {
            if ($this->_resVector === null) {
                $this->_resVector = $nextResVector;
            } else {
                // Manual key intersection; used as workaround for
                // array_intersect_key() slowness problem.
                $updatedVector = array();
                foreach ($this->_resVector as $id => $value) {
                    if (isset($nextResVector[$id])) {
                        $updatedVector[$id] = $value;
                    }
                }
                $this->_resVector = $updatedVector;
            }

            if (count($this->_resVector) == 0) {
                // Empty result set, we don't need to check other terms
                break;
            }
        }

        // No ksort() needed:
        // Docs are returned ordered. Used algorithm doesn't change elements order.

        // Initialize weight if it's not done yet
        $this->_initWeight($reader);
    }

    /**
     * Get document ids likely matching the query
     *
     * It's an array with document ids as keys (performance considerations)
     *
     * @return array
     */
    public function matchedDocs()
    {
        return $this->_resVector;
    }

    /**
     * Score specified document
     *
     * Returns 0 for documents outside the result vector and for documents in
     * which the phrase frequency turns out to be zero.
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     */
    public function score($docId, Zend_Search_Lucene_Interface $reader)
    {
        if (!isset($this->_resVector[$docId])) {
            return 0;
        }

        if ($this->_slop == 0) {
            $freq = $this->_exactPhraseFreq($docId);
        } else {
            $freq = $this->_sloppyPhraseFreq($docId, $reader);
        }

        if ($freq == 0) {
            // Included in result, but calculated freq is zero
            return 0;
        }

        $tf     = $reader->getSimilarity()->tf($freq);
        $weight = $this->_weight->getValue();
        $norm   = $reader->norm($docId, reset($this->_terms)->field);

        return $tf * $weight * $norm * $this->getBoost();
    }

    /**
     * Return query terms
     *
     * @return array
     */
    public function getQueryTerms()
    {
        return $this->_terms;
    }

    /**
     * Highlight query terms
     *
     * @param Zend_Search_Lucene_Document_Html $doc
     * @param integer &$colorIndex
     */
    public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
    {
        $words = array();
        foreach ($this->_terms as $term) {
            $words[] = $term->text;
        }

        $doc->highlight($words, $this->_getHighlightColor($colorIndex));
    }

    /**
     * Print a query
     *
     * Format: [field:]"text1 text2 ..."[~slop]
     *
     * @return string
     */
    public function __toString()
    {
        // It's used only for query visualisation, so we don't care about characters escaping
        $query = '';

        // reset() returns false for an empty array; term keys need not start at 0.
        $firstTerm = reset($this->_terms);
        if ($firstTerm !== false && $firstTerm->field !== null) {
            $query .= $firstTerm->field . ':';
        }

        $texts = array();
        foreach ($this->_terms as $term) {
            $texts[] = $term->text;
        }
        $query .= '"' . implode(' ', $texts) . '"';

        if ($this->_slop != 0) {
            $query .= '~' . $this->_slop;
        }

        return $query;
    }
}
Query/Fuzzy.php 0000666 00000037233 15125175533 0007534 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/** Zend_Search_Lucene_Search_Query_MultiTerm */
require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
/**
 * Fuzzy query: matches index terms that are similar to a base term.
 *
 * The query is not searchable itself. rewrite() scans the index terms,
 * keeps those whose similarity exceeds the configured minimum and returns
 * an empty, term or boolean query built from them.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Search_Query_Fuzzy extends Zend_Search_Lucene_Search_Query
{
    /** Default minimum similarity */
    const DEFAULT_MIN_SIMILARITY = 0.5;

    /**
     * Maximum number of matched terms.
     * Apache Lucene defines this limitation as boolean query maximum number of clauses:
     * org.apache.lucene.search.BooleanQuery.getMaxClauseCount()
     */
    const MAX_CLAUSE_COUNT = 1024;

    /**
     * Array of precalculated max distances
     *
     * keys are integers representing a word size
     *
     * @var array
     */
    private $_maxDistances = array();

    /**
     * Base searching term.
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_term;

    /**
     * A value between 0 and 1 to set the required similarity
     *  between the query term and the matching terms. For example, for a
     *  _minimumSimilarity of 0.5 a term of the same length
     *  as the query term is considered similar to the query term if the edit distance
     *  between both terms is less than length(term)*0.5
     *
     * @var float
     */
    private $_minimumSimilarity;

    /**
     * The length of common (non-fuzzy) prefix
     *
     * @var integer
     */
    private $_prefixLength;

    /**
     * Matched terms.
     *
     * Matched terms list.
     * It's filled during the search (rewrite operation) and may be used for search result
     * post-processing
     *
     * Array of Zend_Search_Lucene_Index_Term objects
     *
     * @var array
     */
    private $_matches = null;

    /**
     * Matched terms scores
     *
     * @var array
     */
    private $_scores = null;

    /**
     * Array of the term keys.
     * Used to sort terms in alphabetical order if terms have the same scores
     *
     * @var array
     */
    private $_termKeys = null;

    /**
     * Default non-fuzzy prefix length
     *
     * @var integer
     */
    private static $_defaultPrefixLength = 3;

    /**
     * Zend_Search_Lucene_Search_Query_Fuzzy constructor.
     *
     * @param Zend_Search_Lucene_Index_Term $term              Base term to compare index terms with.
     * @param float                         $minimumSimilarity Required similarity, 0 <= value < 1.
     * @param integer|null                  $prefixLength      Length of the non-fuzzy prefix; the
     *                                                         default prefix length is used when null.
     * @throws Zend_Search_Lucene_Exception When $minimumSimilarity is outside [0, 1) or
     *                                      $prefixLength is negative.
     */
    public function __construct(Zend_Search_Lucene_Index_Term $term, $minimumSimilarity = self::DEFAULT_MIN_SIMILARITY, $prefixLength = null)
    {
        if ($minimumSimilarity < 0) {
            throw new Zend_Search_Lucene_Exception('minimumSimilarity cannot be less than 0');
        }
        if ($minimumSimilarity >= 1) {
            throw new Zend_Search_Lucene_Exception('minimumSimilarity cannot be greater than or equal to 1');
        }
        if ($prefixLength < 0) {
            throw new Zend_Search_Lucene_Exception('prefixLength cannot be less than 0');
        }

        $this->_term              = $term;
        $this->_minimumSimilarity = $minimumSimilarity;
        $this->_prefixLength      = ($prefixLength !== null) ? $prefixLength : self::$_defaultPrefixLength;
    }

    /**
     * Get default non-fuzzy prefix length
     *
     * @return integer
     */
    public static function getDefaultPrefixLength()
    {
        return self::$_defaultPrefixLength;
    }

    /**
     * Set default non-fuzzy prefix length
     *
     * @param integer $defaultPrefixLength
     */
    public static function setDefaultPrefixLength($defaultPrefixLength)
    {
        self::$_defaultPrefixLength = $defaultPrefixLength;
    }

    /**
     * Calculate maximum distance for specified word length
     *
     * The result is cached in $_maxDistances under $length.
     *
     * @param integer $prefixLength
     * @param integer $termLength
     * @param integer $length
     * @return integer
     */
    private function _calculateMaxDistance($prefixLength, $termLength, $length)
    {
        $this->_maxDistances[$length] = (int) ((1 - $this->_minimumSimilarity)*(min($termLength, $length) + $prefixLength));

        return $this->_maxDistances[$length];
    }

    /**
     * Record an index term as a match.
     *
     * Appends the term, its key and its scaled score to the parallel result
     * arrays and enforces the terms-per-query limit.
     *
     * @param Zend_Search_Lucene_Index_Term $matchedTerm
     * @param float   $similarity  Similarity of the term; caller has checked it exceeds the minimum.
     * @param float   $scaleFactor 1/(1 - minimum similarity).
     * @param integer $maxTerms    Terms per query limit, 0 disables the check.
     * @throws Zend_Search_Lucene_Exception When the limit is exceeded.
     */
    private function _addMatch($matchedTerm, $similarity, $scaleFactor, $maxTerms)
    {
        $this->_matches[]  = $matchedTerm;
        $this->_termKeys[] = $matchedTerm->key();
        $this->_scores[]   = ($similarity - $this->_minimumSimilarity)*$scaleFactor;

        if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
            throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
        }
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * Scans the terms of the term's field (or of all fields from
     * getFieldNames(true) when the term has no field) and keeps those whose
     * similarity to the base term is greater than the minimum similarity.
     * Returns an empty query for no match, a term query for a single match
     * and otherwise a boolean query of at most MAX_CLAUSE_COUNT term
     * subqueries, best scores first, each boosted by its score.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Exception
     */
    public function rewrite(Zend_Search_Lucene_Interface $index)
    {
        $this->_matches  = array();
        $this->_scores   = array();
        $this->_termKeys = array();

        if ($this->_term->field === null) {
            // Search through all fields
            $fields = $index->getFieldNames(true /* indexed fields list */);
        } else {
            $fields = array($this->_term->field);
        }

        $prefix           = Zend_Search_Lucene_Index_Term::getPrefix($this->_term->text, $this->_prefixLength);
        $prefixByteLength = strlen($prefix);
        $prefixUtf8Length = Zend_Search_Lucene_Index_Term::getLength($prefix);

        $termRest = substr($this->_term->text, $prefixByteLength);
        // we calculate length of the rest in bytes since levenshtein() is not UTF-8 compatible
        $termRestLength = strlen($termRest);

        $scaleFactor = 1/(1 - $this->_minimumSimilarity);
        $maxTerms    = Zend_Search_Lucene::getTermsPerQueryLimit();

        foreach ($fields as $field) {
            $index->resetTermsStream();

            if ($prefix != '') {
                $index->skipTo(new Zend_Search_Lucene_Index_Term($prefix, $field));

                // Only terms of this field that start with the common prefix are candidates.
                while (($current = $index->currentTerm()) !== null &&
                       $current->field == $field &&
                       substr($current->text, 0, $prefixByteLength) == $prefix) {
                    // Calculate similarity
                    $target       = substr($current->text, $prefixByteLength);
                    $targetLength = strlen($target);

                    $maxDistance = isset($this->_maxDistances[$targetLength])
                        ? $this->_maxDistances[$targetLength]
                        : $this->_calculateMaxDistance($prefixUtf8Length, $termRestLength, $targetLength);

                    if ($termRestLength == 0) {
                        // we don't have anything to compare.  That means if we just add
                        // the letters for current term we get the new word
                        $similarity = ($prefixUtf8Length == 0) ? 0 : 1 - $targetLength/$prefixUtf8Length;
                    } else if ($targetLength == 0) {
                        $similarity = ($prefixUtf8Length == 0) ? 0 : 1 - $termRestLength/$prefixUtf8Length;
                    } else if ($maxDistance < abs($termRestLength - $targetLength)) {
                        // The length difference alone is a lower bound for the edit distance.
                        // For example "pre" (3) vs "prefixes" (8): the distance cannot be less
                        // than abs(3-8) = 5, so with a maximum of 4 the word can be discarded
                        // without looking at it.
                        $similarity = 0;
                    } else {
                        $similarity = 1 - levenshtein($termRest, $target)/($prefixUtf8Length + min($termRestLength, $targetLength));
                    }

                    if ($similarity > $this->_minimumSimilarity) {
                        $this->_addMatch($current, $similarity, $scaleFactor, $maxTerms);
                    }

                    $index->nextTerm();
                }
            } else {
                $index->skipTo(new Zend_Search_Lucene_Index_Term('', $field));

                while (($current = $index->currentTerm()) !== null && $current->field == $field) {
                    // Calculate similarity
                    $target       = $current->text;
                    $targetLength = strlen($target);

                    $maxDistance = isset($this->_maxDistances[$targetLength])
                        ? $this->_maxDistances[$targetLength]
                        : $this->_calculateMaxDistance(0, $termRestLength, $targetLength);

                    $shorterLength = min($termRestLength, $targetLength);

                    if ($shorterLength == 0 || $maxDistance < abs($termRestLength - $targetLength)) {
                        // Either there is nothing to compare (this also avoids a division
                        // by zero when both strings are empty), or the length difference
                        // alone already exceeds the maximum edit distance.
                        $similarity = 0;
                    } else {
                        $similarity = 1 - levenshtein($termRest, $target)/$shorterLength;
                    }

                    if ($similarity > $this->_minimumSimilarity) {
                        $this->_addMatch($current, $similarity, $scaleFactor, $maxTerms);
                    }

                    $index->nextTerm();
                }
            }

            $index->closeTermsStream();
        }

        $matchCount = count($this->_matches);

        if ($matchCount == 0) {
            return new Zend_Search_Lucene_Search_Query_Empty();
        }

        if ($matchCount == 1) {
            return new Zend_Search_Lucene_Search_Query_Term(reset($this->_matches));
        }

        $rewrittenQuery = new Zend_Search_Lucene_Search_Query_Boolean();

        // Best scores first; equal scores are ordered by term key.
        array_multisort($this->_scores,   SORT_DESC, SORT_NUMERIC,
                        $this->_termKeys, SORT_ASC,  SORT_STRING,
                        $this->_matches);

        $termCount = 0;
        foreach ($this->_matches as $id => $matchedTerm) {
            $subquery = new Zend_Search_Lucene_Search_Query_Term($matchedTerm);
            $subquery->setBoost($this->_scores[$id]);

            $rewrittenQuery->addSubquery($subquery);

            $termCount++;
            if ($termCount >= self::MAX_CLAUSE_COUNT) {
                break;
            }
        }

        return $rewrittenQuery;
    }

    /**
     * Optimize query in the context of specified index
     *
     * Not supported: always throws, the query has to be rewritten first.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Exception
     */
    public function optimize(Zend_Search_Lucene_Interface $index)
    {
        throw new Zend_Search_Lucene_Exception('Fuzzy search query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Return query terms
     *
     * @return array Terms matched by the last rewrite() call.
     * @throws Zend_Search_Lucene_Exception When rewrite() has not been called yet.
     */
    public function getQueryTerms()
    {
        if ($this->_matches === null) {
            throw new Zend_Search_Lucene_Exception('Search has to be performed first to get matched terms');
        }

        return $this->_matches;
    }

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * Not supported: always throws, the query has to be rewritten first.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @return Zend_Search_Lucene_Search_Weight
     * @throws Zend_Search_Lucene_Exception
     */
    public function createWeight(Zend_Search_Lucene_Interface $reader)
    {
        throw new Zend_Search_Lucene_Exception('Fuzzy search query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Execute query in context of index reader
     *
     * Not supported: always throws, the query has to be rewritten first.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
     * @throws Zend_Search_Lucene_Exception
     */
    public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
    {
        throw new Zend_Search_Lucene_Exception('Fuzzy search query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Get document ids likely matching the query
     *
     * Not supported: always throws, the query has to be rewritten first.
     *
     * @return array
     * @throws Zend_Search_Lucene_Exception
     */
    public function matchedDocs()
    {
        throw new Zend_Search_Lucene_Exception('Fuzzy search query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Score specified document
     *
     * Not supported: always throws, the query has to be rewritten first.
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     * @throws Zend_Search_Lucene_Exception
     */
    public function score($docId, Zend_Search_Lucene_Interface $reader)
    {
        throw new Zend_Search_Lucene_Exception('Fuzzy search query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Highlight query terms
     *
     * Highlights the terms matched by the last rewrite() call. Before the
     * first rewrite() the list of words is empty.
     *
     * @param Zend_Search_Lucene_Document_Html $doc
     * @param integer &$colorIndex
     */
    public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
    {
        $words = array();

        // $_matches stays null until rewrite() has run; nothing to iterate then.
        if ($this->_matches !== null) {
            foreach ($this->_matches as $term) {
                $words[] = $term->text;
            }
        }

        $doc->highlight($words, $this->_getHighlightColor($colorIndex));
    }

    /**
     * Print a query
     *
     * Format: [field:]text~[similarity]; the similarity is printed (rounded to
     * four decimals) only when it differs from DEFAULT_MIN_SIMILARITY.
     *
     * @return string
     */
    public function __toString()
    {
        // It's used only for query visualisation, so we don't care about characters escaping
        $fieldPrefix = ($this->_term->field === null) ? '' : $this->_term->field . ':';
        $similarity  = ($this->_minimumSimilarity != self::DEFAULT_MIN_SIMILARITY)
            ? round($this->_minimumSimilarity, 4)
            : '';

        return $fieldPrefix . $this->_term->text . '~' . $similarity;
    }
}
Query/Insignificant.php 0000666 00000007414 15125175533 0011170 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/** Zend_Search_Lucene_Search_Weight_Empty */
require_once 'Zend/Search/Lucene/Search/Weight/Empty.php';
/**
* The insignificant query returns empty result, but doesn't limit result set as a part of other queries
*
* Every method is a constant: the query rewrites and optimizes to itself,
* matches no documents, scores 0 and has no terms.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_Insignificant extends Zend_Search_Lucene_Search_Query
{
/**
* Re-write query into primitive queries in the context of specified index
*
* The query is returned unchanged; $index is not consulted.
*
* @param Zend_Search_Lucene_Interface $index
* @return Zend_Search_Lucene_Search_Query This same query object.
*/
public function rewrite(Zend_Search_Lucene_Interface $index)
{
return $this;
}
/**
* Optimize query in the context of specified index
*
* The query is returned unchanged; $index is not consulted.
*
* @param Zend_Search_Lucene_Interface $index
* @return Zend_Search_Lucene_Search_Query This same query object.
*/
public function optimize(Zend_Search_Lucene_Interface $index)
{
return $this;
}
/**
* Constructs an appropriate Weight implementation for this query.
*
* A new empty weight is returned on every call. It is not stored in
* $this->_weight and $reader is not used.
*
* @param Zend_Search_Lucene_Interface $reader
* @return Zend_Search_Lucene_Search_Weight
*/
public function createWeight(Zend_Search_Lucene_Interface $reader)
{
return new Zend_Search_Lucene_Search_Weight_Empty();
}
/**
* Execute query in context of index reader
*
* Intentionally a no-op: there are no internal structures to initialize.
*
* @param Zend_Search_Lucene_Interface $reader
* @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
*/
public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
{
// Do nothing
}
/**
* Get document ids likely matching the query
*
* It's an array with document ids as keys (performance considerations)
*
* @return array Always an empty array.
*/
public function matchedDocs()
{
return array();
}
/**
* Score specified document
*
* @param integer $docId
* @param Zend_Search_Lucene_Interface $reader
* @return float Always 0, whatever the document.
*/
public function score($docId, Zend_Search_Lucene_Interface $reader)
{
return 0;
}
/**
* Return query terms
*
* @return array Always an empty array.
*/
public function getQueryTerms()
{
return array();
}
/**
* Highlight query terms
*
* Intentionally a no-op: the document is left untouched and $colorIndex
* is not modified.
*
* @param Zend_Search_Lucene_Document_Html $doc
* @param integer &$colorIndex
*/
public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
{
// Do nothing
}
/**
* Print a query
*
* @return string The fixed marker '<InsignificantQuery>'.
*/
public function __toString()
{
return '<InsignificantQuery>';
}
}
Query/Wildcard.php 0000666 00000025623 15125175533 0010136 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/** Zend_Search_Lucene_Search_Query_MultiTerm */
require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_Wildcard extends Zend_Search_Lucene_Search_Query
{
    /**
     * Search pattern.
     *
     * Field has to be fully specified or has to be null.
     * Text may contain '*' or '?' symbols.
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_pattern;

    /**
     * Matched terms.
     *
     * Null until rewrite() has been called. rewrite() fills it with the
     * Zend_Search_Lucene_Index_Term objects that matched the pattern, so it
     * may be used for search result post-processing.
     *
     * @var array|null
     */
    private $_matches = null;

    /**
     * Minimum term prefix length (minimum number of leading non-wildcard characters)
     *
     * @var integer
     */
    private static $_minPrefixLength = 3;

    /**
     * Zend_Search_Lucene_Search_Query_Wildcard constructor.
     *
     * @param Zend_Search_Lucene_Index_Term $pattern
     */
    public function __construct(Zend_Search_Lucene_Index_Term $pattern)
    {
        $this->_pattern = $pattern;
    }

    /**
     * Get minimum prefix length
     *
     * @return integer
     */
    public static function getMinPrefixLength()
    {
        return self::$_minPrefixLength;
    }

    /**
     * Set minimum prefix length
     *
     * @param integer $minPrefixLength
     */
    public static function setMinPrefixLength($minPrefixLength)
    {
        self::$_minPrefixLength = $minPrefixLength;
    }

    /**
     * Get terms prefix: the part of the word that precedes the first '?' or '*'.
     *
     * The word is returned unchanged if it contains no wildcard symbol.
     *
     * @param string $word
     * @return string
     */
    private static function _getPrefix($word)
    {
        $questionMarkPosition = strpos($word, '?');
        $asteriskPosition     = strpos($word, '*');

        if ($questionMarkPosition === false && $asteriskPosition === false) {
            return $word;
        }
        if ($questionMarkPosition === false) {
            return substr($word, 0, $asteriskPosition);
        }
        if ($asteriskPosition === false) {
            return substr($word, 0, $questionMarkPosition);
        }

        return substr($word, 0, min($questionMarkPosition, $asteriskPosition));
    }

    /**
     * Build the anchored regular expression equivalent to the search pattern.
     *
     * The pattern text is quoted first, then the quoted '?' and '*' are turned
     * into '.' and '.*'. The 'u' modifier is appended when PCRE reports
     * Unicode support.
     *
     * @return string
     */
    private function _buildMatchExpression()
    {
        $matchExpression = '/^'
                         . str_replace(array('\\?', '\\*'), array('.', '.*'), preg_quote($this->_pattern->text, '/'))
                         . '$/';

        /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
        if (@preg_match('/\pL/u', 'a') == 1) {
            // PCRE unicode support is turned on
            // add Unicode modifier to the match expression
            $matchExpression .= 'u';
        }

        return $matchExpression;
    }

    /**
     * Create the exception thrown by every operation that is only valid
     * on the query returned by rewrite().
     *
     * @return Zend_Search_Lucene_Exception
     */
    private static function _directUseException()
    {
        return new Zend_Search_Lucene_Exception('Wildcard query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * Walks the terms stream of every searched field, collects the terms
     * matching the pattern and returns an Empty, Term or MultiTerm query
     * depending on whether zero, one or several terms matched.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Exception  if the pattern prefix is too short
     *                                       or the terms per query limit is exceeded
     */
    public function rewrite(Zend_Search_Lucene_Interface $index)
    {
        $this->_matches = array();

        $prefix       = self::_getPrefix($this->_pattern->text);
        $prefixLength = strlen($prefix);
        if ($prefixLength < self::$_minPrefixLength) {
            throw new Zend_Search_Lucene_Exception('At least ' . self::$_minPrefixLength . ' non-wildcard characters are required at the beginning of pattern.');
        }

        if ($this->_pattern->field === null) {
            // Search through all fields
            $fields = $index->getFieldNames(true /* indexed fields list */);
        } else {
            $fields = array($this->_pattern->field);
        }

        $matchExpression = $this->_buildMatchExpression();
        $maxTerms        = Zend_Search_Lucene::getTermsPerQueryLimit();

        foreach ($fields as $field) {
            $index->resetTermsStream();
            $index->skipTo(new Zend_Search_Lucene_Index_Term($prefix, $field));

            // A single loop covers the empty prefix too: strncmp() over zero
            // bytes is always 0. The byte-wise comparison also avoids the
            // numeric-string juggling of '==' ('1e2' == '100').
            while (($term = $index->currentTerm()) !== null
                   && $term->field == $field
                   && strncmp($term->text, $prefix, $prefixLength) === 0) {
                if (preg_match($matchExpression, $term->text) === 1) {
                    $this->_matches[] = $term;

                    if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                        // Do not leave the terms stream open behind the exception
                        $index->closeTermsStream();
                        throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
                    }
                }

                $index->nextTerm();
            }

            $index->closeTermsStream();
        }

        $matchesCount = count($this->_matches);
        if ($matchesCount == 0) {
            return new Zend_Search_Lucene_Search_Query_Empty();
        }
        if ($matchesCount == 1) {
            return new Zend_Search_Lucene_Search_Query_Term(reset($this->_matches));
        }

        $rewrittenQuery = new Zend_Search_Lucene_Search_Query_MultiTerm();
        foreach ($this->_matches as $matchedTerm) {
            $rewrittenQuery->addTerm($matchedTerm);
        }

        return $rewrittenQuery;
    }

    /**
     * Optimize query in the context of specified index
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Exception  always; the query has to be rewritten first
     */
    public function optimize(Zend_Search_Lucene_Interface $index)
    {
        throw self::_directUseException();
    }

    /**
     * Returns query pattern
     *
     * @return Zend_Search_Lucene_Index_Term
     */
    public function getPattern()
    {
        return $this->_pattern;
    }

    /**
     * Return query terms (the terms collected by the last rewrite() call)
     *
     * @return array
     * @throws Zend_Search_Lucene_Exception  if rewrite() has not been called yet
     */
    public function getQueryTerms()
    {
        if ($this->_matches === null) {
            throw new Zend_Search_Lucene_Exception('Search has to be performed first to get matched terms');
        }

        return $this->_matches;
    }

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @return Zend_Search_Lucene_Search_Weight
     * @throws Zend_Search_Lucene_Exception  always; the query has to be rewritten first
     */
    public function createWeight(Zend_Search_Lucene_Interface $reader)
    {
        throw self::_directUseException();
    }

    /**
     * Execute query in context of index reader
     * It also initializes necessary internal structures
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
     * @throws Zend_Search_Lucene_Exception  always; the query has to be rewritten first
     */
    public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
    {
        throw self::_directUseException();
    }

    /**
     * Get document ids likely matching the query
     *
     * It's an array with document ids as keys (performance considerations)
     *
     * @return array
     * @throws Zend_Search_Lucene_Exception  always; the query has to be rewritten first
     */
    public function matchedDocs()
    {
        throw self::_directUseException();
    }

    /**
     * Score specified document
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     * @throws Zend_Search_Lucene_Exception  always; the query has to be rewritten first
     */
    public function score($docId, Zend_Search_Lucene_Interface $reader)
    {
        throw self::_directUseException();
    }

    /**
     * Highlight query terms
     *
     * Tokenizes the 'body' field of the document with the default analyzer
     * and highlights every token whose text matches the pattern.
     *
     * @param Zend_Search_Lucene_Document_Html $doc
     * @param integer &$colorIndex
     */
    public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
    {
        $words           = array();
        $matchExpression = $this->_buildMatchExpression();

        $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($doc->getFieldUtf8Value('body'), 'UTF-8');
        foreach ($tokens as $token) {
            $termText = $token->getTermText();
            if (preg_match($matchExpression, $termText) === 1) {
                $words[] = $termText;
            }
        }

        $doc->highlight($words, $this->_getHighlightColor($colorIndex));
    }

    /**
     * Print a query
     *
     * @return string
     */
    public function __toString()
    {
        // It's used only for query visualisation, so we don't care about characters escaping
        return (($this->_pattern->field === null)? '' : $this->_pattern->field . ':') . $this->_pattern->text;
    }
}
Query/Boolean.php 0000666 00000065650 15125175533 0007770 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/** Zend_Search_Lucene_Search_Weight_Boolean */
require_once 'Zend/Search/Lucene/Search/Weight/Boolean.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_Boolean extends Zend_Search_Lucene_Search_Query
{
/**
 * Subqueries.
 *
 * Array of Zend_Search_Lucene_Search_Query objects. The array keys are
 * shared with the $_signs array (when that one is not null).
 *
 * @var array
 */
private $_subqueries = array();
/**
 * Subqueries signs, indexed by the same keys as $_subqueries.
 *
 * If true then subquery is required.
 * If false then subquery is prohibited.
 * If null then subquery is neither prohibited, nor required.
 *
 * If the property itself is null then all subqueries are required
 * (the query is a pure conjunction).
 *
 * @var array|null
 */
private $_signs = array();
/**
 * Result vector.
 *
 * Null until the query has been executed; afterwards an array with
 * document ids as keys.
 *
 * @var array|null
 */
private $_resVector = null;
/**
 * A score factor based on the fraction of all query subqueries
 * that a document contains.
 *
 * float for conjunction queries,
 * array of float (indexed by the number of matched subqueries)
 * for non conjunction queries.
 * Null until the first document is scored.
 *
 * @var mixed
 */
private $_coord = null;
/**
 * Class constructor. Create a new Boolean query object.
 *
 * If the $signs array is omitted, or contains nothing but true, then all
 * subqueries are required and no signs array is kept (null is stored).
 * It differs from addSubquery() behavior, but should never be used.
 *
 * When $subqueries is not an array both arguments are ignored.
 *
 * @param array $subqueries  Array of Zend_Search_Lucene_Search_Query objects
 * @param array $signs       Array of signs. Sign is boolean|null.
 * @return void
 */
public function __construct($subqueries = null, $signs = null)
{
    if (!is_array($subqueries)) {
        return;
    }

    $this->_subqueries = $subqueries;
    $this->_signs      = null;

    if (!is_array($signs)) {
        return;
    }

    // Keep the signs only if at least one subquery is not required
    foreach ($signs as $sign) {
        if ($sign !== true) {
            $this->_signs = $signs;
            return;
        }
    }
}
/**
 * Add a $subquery (Zend_Search_Lucene_Search_Query) to this query.
 *
 * The sign is specified as:
 *     TRUE  - subquery is required
 *     FALSE - subquery is prohibited
 *     NULL  - subquery is neither prohibited, nor required
 *
 * The sign is always stored under the key the subquery received, so signs
 * stay attached to their subqueries even when the subqueries array has
 * gaps in its keys (e.g. a query produced by optimize()).
 *
 * @param  Zend_Search_Lucene_Search_Query $subquery
 * @param  boolean|null $sign
 * @return void
 */
public function addSubquery(Zend_Search_Lucene_Search_Query $subquery, $sign=null) {
    $this->_subqueries[] = $subquery;

    if ($sign === true && $this->_signs === null) {
        // All subqueries are still required: no signs array is needed
        return;
    }

    // Key the new subquery has just been stored under
    end($this->_subqueries);
    $subqueryKey = key($this->_subqueries);

    if ($this->_signs === null) {
        // All previous subqueries were required: materialize their signs
        // using the subqueries' own keys
        $this->_signs = array();
        foreach ($this->_subqueries as $key => $prevSubquery) {
            $this->_signs[$key] = true;
        }
    }

    $this->_signs[$subqueryKey] = $sign;
}
/**
 * Re-write queries into primitive queries
 *
 * Builds a new Boolean query with the same boost, made of the rewritten
 * subqueries with their original signs.
 *
 * @param Zend_Search_Lucene_Interface $index
 * @return Zend_Search_Lucene_Search_Query
 */
public function rewrite(Zend_Search_Lucene_Interface $index)
{
    $rewritten = new Zend_Search_Lucene_Search_Query_Boolean();
    $rewritten->setBoost($this->getBoost());

    // A null signs array means that every subquery is required
    $allRequired = ($this->_signs === null);

    foreach ($this->_subqueries as $key => $subquery) {
        $sign = $allRequired ? true : $this->_signs[$key];
        $rewritten->addSubquery($subquery->rewrite($index), $sign);
    }

    return $rewritten;
}
/**
 * Optimize query in the context of specified index
 *
 * Steps, in order:
 *  - every subquery is optimized; insignificant subqueries are dropped;
 *  - an Insignificant query is returned if nothing is left or if everything
 *    left is prohibited;
 *  - an Empty query is returned if a required subquery is empty, or if,
 *    after dropping empty optional/prohibited subqueries, nothing is left
 *    or everything left is prohibited;
 *  - a single remaining subquery is returned itself (cloned, with the boost
 *    factors multiplied, if this query's boost is not 1);
 *  - otherwise Term and MultiTerm subqueries are decomposed into terms and
 *    regrouped into a MultiTerm query or into a smaller Boolean query.
 *
 * @param Zend_Search_Lucene_Interface $index
 * @return Zend_Search_Lucene_Search_Query
 */
public function optimize(Zend_Search_Lucene_Interface $index)
{
$subqueries = array();
$signs = array();
// Optimize all subqueries (signs are materialized: null signs array means 'all required')
foreach ($this->_subqueries as $id => $subquery) {
$subqueries[] = $subquery->optimize($index);
$signs[] = ($this->_signs === null)? true : $this->_signs[$id];
}
// Remove insignificant subqueries
foreach ($subqueries as $id => $subquery) {
if ($subquery instanceof Zend_Search_Lucene_Search_Query_Insignificant) {
// Insignificant subquery has to be removed anyway
unset($subqueries[$id]);
unset($signs[$id]);
}
}
if (count($subqueries) == 0) {
// Boolean query doesn't have non-insignificant subqueries
return new Zend_Search_Lucene_Search_Query_Insignificant();
}
// Check if all non-insignificant subqueries are prohibited
$allProhibited = true;
foreach ($signs as $sign) {
if ($sign !== false) {
$allProhibited = false;
break;
}
}
if ($allProhibited) {
return new Zend_Search_Lucene_Search_Query_Insignificant();
}
// Check for empty subqueries
foreach ($subqueries as $id => $subquery) {
if ($subquery instanceof Zend_Search_Lucene_Search_Query_Empty) {
if ($signs[$id] === true) {
// Matching is required, but is actually empty
return new Zend_Search_Lucene_Search_Query_Empty();
} else {
// Matching is optional or prohibited, but is empty
// Remove it from subqueries and signs list
unset($subqueries[$id]);
unset($signs[$id]);
}
}
}
// Check, if reduced subqueries list is empty
if (count($subqueries) == 0) {
return new Zend_Search_Lucene_Search_Query_Empty();
}
// Check if all non-empty subqueries are prohibited
$allProhibited = true;
foreach ($signs as $sign) {
if ($sign !== false) {
$allProhibited = false;
break;
}
}
if ($allProhibited) {
return new Zend_Search_Lucene_Search_Query_Empty();
}
// Check, if reduced subqueries list has only one entry
if (count($subqueries) == 1) {
// It's a query with only one required or optional clause
// (it's already checked, that it's not a prohibited clause)
if ($this->getBoost() == 1) {
return reset($subqueries);
}
// Clone the subquery so that the original object keeps its own boost
$optimizedQuery = clone reset($subqueries);
$optimizedQuery->setBoost($optimizedQuery->getBoost()*$this->getBoost());
return $optimizedQuery;
}
// Prepare first candidate for optimized query
// (it is returned as is whenever the regrouping below can't be completed)
$optimizedQuery = new Zend_Search_Lucene_Search_Query_Boolean($subqueries, $signs);
$optimizedQuery->setBoost($this->getBoost());
// Decomposed terms, their signs and boost factors (parallel arrays)
$terms = array();
$tsigns = array();
$boostFactors = array();
// Try to decompose term and multi-term subqueries
foreach ($subqueries as $id => $subquery) {
if ($subquery instanceof Zend_Search_Lucene_Search_Query_Term) {
$terms[] = $subquery->getTerm();
$tsigns[] = $signs[$id];
$boostFactors[] = $subquery->getBoost();
// remove subquery from a subqueries list
unset($subqueries[$id]);
unset($signs[$id]);
} else if ($subquery instanceof Zend_Search_Lucene_Search_Query_MultiTerm) {
$subTerms = $subquery->getTerms();
$subSigns = $subquery->getSigns();
if ($signs[$id] === true) {
// It's a required multi-term subquery.
// Something like '... +(+term1 -term2 term3 ...) ...'
// Multi-term required subquery can be decomposed only if it contains
// required terms and doesn't contain prohibited terms:
// ... +(+term1 term2 ...) ... => ... +term1 term2 ...
//
// Check this
$hasRequired = false;
$hasProhibited = false;
if ($subSigns === null) {
// All subterms are required
$hasRequired = true;
} else {
foreach ($subSigns as $sign) {
if ($sign === true) {
$hasRequired = true;
} else if ($sign === false) {
$hasProhibited = true;
break;
}
}
}
// Continue if subquery has prohibited terms or doesn't have required terms
if ($hasProhibited || !$hasRequired) {
continue;
}
// Move the subterms, with their own signs, to the common terms list
foreach ($subTerms as $termId => $term) {
$terms[] = $term;
$tsigns[] = ($subSigns === null)? true : $subSigns[$termId];
$boostFactors[] = $subquery->getBoost();
}
// remove subquery from a subqueries list
unset($subqueries[$id]);
unset($signs[$id]);
} else { // $signs[$id] === null || $signs[$id] === false
// It's an optional or prohibited multi-term subquery.
// Something like '... (+term1 -term2 term3 ...) ...'
// or
// something like '... -(+term1 -term2 term3 ...) ...'
// Multi-term optional and prohibited subqueries can be decomposed
// only if all terms are optional.
//
// Check if all terms are optional.
$onlyOptional = true;
if ($subSigns === null) {
// All subterms are required
$onlyOptional = false;
} else {
foreach ($subSigns as $sign) {
if ($sign !== null) {
$onlyOptional = false;
break;
}
}
}
// Continue if non-optional terms are presented in this multi-term subquery
if (!$onlyOptional) {
continue;
}
// Move the subterms to the common terms list; they inherit the sign of the subquery
foreach ($subTerms as $termId => $term) {
$terms[] = $term;
$tsigns[] = ($signs[$id] === null)? null /* optional */ :
false /* prohibited */;
$boostFactors[] = $subquery->getBoost();
}
// remove subquery from a subqueries list
unset($subqueries[$id]);
unset($signs[$id]);
}
}
}
// Check, if there are no decomposed subqueries
if (count($terms) == 0 ) {
// return prepared candidate
return $optimizedQuery;
}
// Check, if all subqueries have been decomposed and all terms have the same boost factor
if (count($subqueries) == 0 && count(array_unique($boostFactors)) == 1) {
$optimizedQuery = new Zend_Search_Lucene_Search_Query_MultiTerm($terms, $tsigns);
$optimizedQuery->setBoost(reset($boostFactors)*$this->getBoost());
return $optimizedQuery;
}
// This boolean query can't be transformed to Term/MultiTerm query and still contains
// several subqueries
// Separate prohibited terms
$prohibitedTerms = array();
foreach ($terms as $id => $term) {
if ($tsigns[$id] === false) {
$prohibitedTerms[] = $term;
unset($terms[$id]);
unset($tsigns[$id]);
unset($boostFactors[$id]);
}
}
// Regroup the remaining (required and optional) terms into one clause.
// Terms with different boost factors are left ungrouped ($terms stays non-empty).
if (count($terms) == 1) {
$clause = new Zend_Search_Lucene_Search_Query_Term(reset($terms));
$clause->setBoost(reset($boostFactors));
$subqueries[] = $clause;
$signs[] = reset($tsigns);
// Clear terms list
$terms = array();
} else if (count($terms) > 1 && count(array_unique($boostFactors)) == 1) {
$clause = new Zend_Search_Lucene_Search_Query_MultiTerm($terms, $tsigns);
$clause->setBoost(reset($boostFactors));
$subqueries[] = $clause;
// Clause sign is 'required' if clause contains required terms. 'Optional' otherwise.
$signs[] = (in_array(true, $tsigns))? true : null;
// Clear terms list
$terms = array();
}
// Regroup prohibited terms into one prohibited clause
if (count($prohibitedTerms) == 1) {
// (boost factors are not significant for prohibited clauses)
$subqueries[] = new Zend_Search_Lucene_Search_Query_Term(reset($prohibitedTerms));
$signs[] = false;
// Clear prohibited terms list
$prohibitedTerms = array();
} else if (count($prohibitedTerms) > 1) {
// prepare signs array
$prohibitedSigns = array();
foreach ($prohibitedTerms as $id => $term) {
// all prohibited terms are grouped as optional into multi-term query
$prohibitedSigns[$id] = null;
}
// (boost factors are not significant for prohibited clauses)
$subqueries[] = new Zend_Search_Lucene_Search_Query_MultiTerm($prohibitedTerms, $prohibitedSigns);
// Clause sign is 'prohibited'
$signs[] = false;
// Clear terms list
$prohibitedTerms = array();
}
/** @todo Group terms with the same boost factors together */
// Check, that all terms are processed
// Replace candidate for optimized query
// (otherwise the first candidate, built before decomposition, is returned)
if (count($terms) == 0 && count($prohibitedTerms) == 0) {
$optimizedQuery = new Zend_Search_Lucene_Search_Query_Boolean($subqueries, $signs);
$optimizedQuery->setBoost($this->getBoost());
}
return $optimizedQuery;
}
/**
 * Returns subqueries
 *
 * @return array  Array of Zend_Search_Lucene_Search_Query objects
 */
public function getSubqueries()
{
    return $this->_subqueries;
}
/**
 * Return subqueries signs
 *
 * @return array|null  Signs array, or null if all subqueries are required
 */
public function getSigns()
{
    return $this->_signs;
}
/**
 * Constructs an appropriate Weight implementation for this query.
 *
 * The created weight object is also remembered in $this->_weight.
 *
 * @param Zend_Search_Lucene_Interface $reader
 * @return Zend_Search_Lucene_Search_Weight
 */
public function createWeight(Zend_Search_Lucene_Interface $reader)
{
    return $this->_weight = new Zend_Search_Lucene_Search_Weight_Boolean($this, $reader);
}
/**
 * Calculate result vector for Conjunction query
 * (like '<subquery1> AND <subquery2> AND <subquery3>')
 *
 * The result is the key intersection of the subqueries' matchedDocs()
 * arrays, or an empty array if there are no subqueries.
 */
private function _calculateConjunctionResult()
{
    $this->_resVector = null;

    if (count($this->_subqueries) == 0) {
        $this->_resVector = array();
        return;
    }

    $resVectors      = array();
    $resVectorsSizes = array();
    $resVectorsIds   = array(); // is used to prevent arrays comparison
    foreach ($this->_subqueries as $subqueryId => $subquery) {
        $matched = $subquery->matchedDocs();

        $resVectors[]      = $matched;
        $resVectorsSizes[] = count($matched);
        $resVectorsIds[]   = $subqueryId;
    }

    // sort resvectors in order of subquery cardinality increasing
    array_multisort($resVectorsSizes, SORT_ASC, SORT_NUMERIC,
                    $resVectorsIds,   SORT_ASC, SORT_NUMERIC,
                    $resVectors);

    $result = null;
    foreach ($resVectors as $nextResVector) {
        if ($result === null) {
            $result = $nextResVector;
        } else {
            // Manual loop is used as workaround for array_intersect_key() slowness problem.
            $intersection = array();
            foreach ($result as $id => $value) {
                if (isset($nextResVector[$id])) {
                    $intersection[$id] = $value;
                }
            }
            $result = $intersection;
        }

        if (count($result) == 0) {
            // Empty result set, we don't need to check other terms
            break;
        }
    }

    // No ksort() here: used algorithm doesn't change elements order
    $this->_resVector = $result;
}
/**
 * Calculate result vector for non Conjunction query
 * (like '<subquery1> AND <subquery2> AND NOT <subquery3> OR <subquery4>')
 *
 * If there are required subqueries, the result is the key intersection of
 * their matchedDocs() arrays; otherwise it is the union of the optional
 * subqueries' matchedDocs(). Prohibited subqueries are not consulted here.
 * The result is sorted by key.
 */
private function _calculateNonConjunctionResult()
{
    $requiredVectors      = array();
    $requiredVectorsSizes = array();
    $requiredVectorsIds   = array(); // is used to prevent arrays comparison
    $optional             = array();

    foreach ($this->_subqueries as $subqueryId => $subquery) {
        $sign = $this->_signs[$subqueryId];

        if ($sign === true) {
            // required
            $matched = $subquery->matchedDocs();

            $requiredVectors[]      = $matched;
            $requiredVectorsSizes[] = count($matched);
            $requiredVectorsIds[]   = $subqueryId;
        } elseif ($sign !== false) {
            // neither required, nor prohibited
            // array union
            $optional += $subquery->matchedDocs();
        }
        // prohibited: skipped. matchedDocs() may include non-matching id's;
        // calculating prohibited vector may take significant time, but does not affect the result
    }

    // sort resvectors in order of subquery cardinality increasing
    array_multisort($requiredVectorsSizes, SORT_ASC, SORT_NUMERIC,
                    $requiredVectorsIds,   SORT_ASC, SORT_NUMERIC,
                    $requiredVectors);

    $required = null;
    foreach ($requiredVectors as $nextResVector) {
        if ($required === null) {
            $required = $nextResVector;
        } else {
            // Manual loop is used as workaround for array_intersect_key() slowness problem.
            $intersection = array();
            foreach ($required as $id => $value) {
                if (isset($nextResVector[$id])) {
                    $intersection[$id] = $value;
                }
            }
            $required = $intersection;
        }

        if (count($required) == 0) {
            // Empty result set, we don't need to check other terms
            break;
        }
    }

    $this->_resVector = ($required !== null) ? $required : $optional;

    ksort($this->_resVector, SORT_NUMERIC);
}
/**
 * Score calculator for conjunction queries (all subqueries are required)
 *
 * Returns 0 as soon as one subquery scores 0 for the document. Each
 * subquery is scored exactly once per call.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene_Interface $reader
 * @return float
 */
public function _conjunctionScore($docId, Zend_Search_Lucene_Interface $reader)
{
    if ($this->_coord === null) {
        // Computed once and cached: all subqueries of a conjunction have to match
        $subqueriesCount = count($this->_subqueries);
        $this->_coord    = $reader->getSimilarity()->coord($subqueriesCount, $subqueriesCount);
    }

    $score = 0;

    foreach ($this->_subqueries as $subquery) {
        $subscore = $subquery->score($docId, $reader);

        if ($subscore == 0) {
            return 0;
        }

        // Reuse the value computed above instead of scoring the subquery a second time
        $score += $subscore * $this->_coord;
    }

    // The coord factor is applied to each term and to the sum, as before
    return $score * $this->_coord * $this->getBoost();
}
/**
 * Score calculator for non conjunction queries (not all subqueries are required)
 *
 * Returns 0 if a prohibited subquery matches the document or a required
 * one doesn't. Otherwise the sum of the non-zero subscores is scaled by the
 * coord factor for the number of matched subqueries and by the boost.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene_Interface $reader
 * @return float
 */
public function _nonConjunctionScore($docId, Zend_Search_Lucene_Interface $reader)
{
    if ($this->_coord === null) {
        // Lazily build the coord factors table, indexed by the number of matched subqueries
        $this->_coord = array();

        $maxCoord = 0;
        foreach ($this->_signs as $sign) {
            if ($sign !== false /* not prohibited */) {
                $maxCoord++;
            }
        }

        for ($matched = 0; $matched <= $maxCoord; $matched++) {
            $this->_coord[$matched] = $reader->getSimilarity()->coord($matched, $maxCoord);
        }
    }

    $total             = 0;
    $matchedSubqueries = 0;

    foreach ($this->_subqueries as $subqueryId => $subquery) {
        $subscore  = $subquery->score($docId, $reader);
        $sign      = $this->_signs[$subqueryId];
        $isMatched = ($subscore != 0);

        // Prohibited, but matches
        if ($sign === false && $isMatched) {
            return 0;
        }

        // Required, but doesn't match
        if ($sign === true && !$isMatched) {
            return 0;
        }

        if ($isMatched) {
            $matchedSubqueries++;
            $total += $subscore;
        }
    }

    return $total * $this->_coord[$matchedSubqueries] * $this->getBoost();
}
/**
 * Execute query in context of index reader
 * It also initializes necessary internal structures
 *
 * Required subqueries are executed with the shared documents filter,
 * all other subqueries without a filter. The result vector is calculated
 * afterwards.
 *
 * @param Zend_Search_Lucene_Interface $reader
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
 */
public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
{
    // Initialize weight if it's not done yet
    $this->_initWeight($reader);

    if ($docsFilter === null) {
        // Create local documents filter if it's not provided by upper query
        $docsFilter = new Zend_Search_Lucene_Index_DocsFilter();
    }

    // A null signs array (and only null, hence the strict check) marks a pure conjunction
    $isConjunction = ($this->_signs === null);

    foreach ($this->_subqueries as $subqueryId => $subquery) {
        if ($isConjunction || $this->_signs[$subqueryId] === true) {
            // Subquery is required
            $subquery->execute($reader, $docsFilter);
        } else {
            $subquery->execute($reader);
        }
    }

    if ($isConjunction) {
        $this->_calculateConjunctionResult();
    } else {
        $this->_calculateNonConjunctionResult();
    }
}
/**
 * Get document ids likely matching the query
 *
 * It's an array with document ids as keys (performance considerations).
 * The value is whatever the result vector currently holds (null before
 * the query has been executed).
 *
 * @return array
 */
public function matchedDocs()
{
    return $this->_resVector;
}
/**
 * Score specified document
 *
 * Documents that are not present in the result vector score 0.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene_Interface $reader
 * @return float
 */
public function score($docId, Zend_Search_Lucene_Interface $reader)
{
    if (!isset($this->_resVector[$docId])) {
        return 0;
    }

    // A null signs array means that all subqueries are required
    return ($this->_signs === null)
         ? $this->_conjunctionScore($docId, $reader)
         : $this->_nonConjunctionScore($docId, $reader);
}
/**
 * Return query terms
 *
 * Terms of prohibited subqueries are not included.
 *
 * @return array
 */
public function getQueryTerms()
{
    $allRequired = ($this->_signs === null);
    $queryTerms  = array();

    foreach ($this->_subqueries as $key => $subquery) {
        if (!$allRequired && $this->_signs[$key] === false) {
            // prohibited subquery
            continue;
        }

        $queryTerms = array_merge($queryTerms, $subquery->getQueryTerms());
    }

    return $queryTerms;
}
/**
 * Highlight query terms
 *
 * Delegates to every subquery that is not prohibited.
 *
 * @param Zend_Search_Lucene_Document_Html $doc
 * @param integer &$colorIndex
 */
public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
{
    $allRequired = ($this->_signs === null);

    foreach ($this->_subqueries as $key => $subquery) {
        if (!$allRequired && $this->_signs[$key] === false) {
            // prohibited subquery
            continue;
        }

        $subquery->highlightMatchesDOM($doc, $colorIndex);
    }
}
/**
 * Print a query
 *
 * Subqueries are printed as '+(...)' (required), '-(...)' (prohibited) or
 * '(...)' (optional), followed by '^boost' when the boost is not 1, and
 * separated by single spaces.
 *
 * @return string
 */
public function __toString()
{
    // It's used only for query visualisation, so we don't care about characters escaping

    $query   = '';
    $isFirst = true;

    foreach ($this->_subqueries as $id => $subquery) {
        // The separator must not depend on the key value: the first key
        // is not necessarily 0 (e.g. for a query built by optimize())
        if ($isFirst) {
            $isFirst = false;
        } else {
            $query .= ' ';
        }

        if ($this->_signs === null || $this->_signs[$id] === true) {
            $query .= '+';
        } else if ($this->_signs[$id] === false) {
            $query .= '-';
        }

        $query .= '(' . $subquery->__toString() . ')';

        if ($subquery->getBoost() != 1) {
            $query .= '^' . round($subquery->getBoost(), 4);
        }
    }

    return $query;
}
}
QueryEntry/Subquery.php 0000666 00000004431 15125175533 0011240 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Index_Term */
require_once 'Zend/Search/Lucene/Index/Term.php';
/** Zend_Search_Lucene_Search_QueryEntry */
require_once 'Zend/Search/Lucene/Search/QueryEntry.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_QueryEntry_Subquery extends Zend_Search_Lucene_Search_QueryEntry
{
    /**
     * Wrapped query
     *
     * @var Zend_Search_Lucene_Search_Query
     */
    private $_query;

    /**
     * Object constructor
     *
     * @param Zend_Search_Lucene_Search_Query $query
     */
    public function __construct(Zend_Search_Lucene_Search_Query $query)
    {
        $this->_query = $query;
    }

    /**
     * Process modifier ('~')
     *
     * The modifier is never accepted for a subquery entry.
     *
     * @param mixed $parameter
     * @throws Zend_Search_Lucene_Search_QueryParserException  always
     */
    public function processFuzzyProximityModifier($parameter = null)
    {
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';

        $message = '\'~\' sign must follow term or phrase';
        throw new Zend_Search_Lucene_Search_QueryParserException($message);
    }

    /**
     * Transform entry to a subquery
     *
     * The entry's boost is applied to the wrapped query, which is then
     * returned; the encoding is not used.
     *
     * @param string $encoding
     * @return Zend_Search_Lucene_Search_Query
     */
    public function getQuery($encoding)
    {
        $query = $this->_query;
        $query->setBoost($this->_boost);

        return $query;
    }
}
QueryEntry/Term.php 0000666 00000014274 15125175533 0010336 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Index_Term */
require_once 'Zend/Search/Lucene/Index/Term.php';
/** Zend_Search_Lucene_Search_QueryEntry */
require_once 'Zend/Search/Lucene/Search/QueryEntry.php';
/** Zend_Search_Lucene_Analysis_Analyzer */
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_QueryEntry_Term extends Zend_Search_Lucene_Search_QueryEntry
{
    /**
     * Raw term text, exactly as passed to the constructor
     *
     * @var string
     */
    private $_term;

    /**
     * Field the generated index terms are bound to
     *
     * @var string|null
     */
    private $_field;

    /**
     * Set to true once the '~' modifier has been applied to this entry
     *
     * @var boolean
     */
    private $_fuzzyQuery = false;

    /**
     * Similarity value handed to the fuzzy query
     *
     * @var float
     */
    private $_similarity = 1.;

    /**
     * Object constructor
     *
     * @param string $term
     * @param string $field
     */
    public function __construct($term, $field)
    {
        $this->_field = $field;
        $this->_term  = $term;
    }

    /**
     * Process modifier ('~')
     *
     * Marks the entry as fuzzy. When no parameter is supplied, the
     * similarity falls back to the fuzzy query's default minimum.
     *
     * @param mixed $parameter
     */
    public function processFuzzyProximityModifier($parameter = null)
    {
        $this->_fuzzyQuery = true;
        $this->_similarity = ($parameter === null)
            ? Zend_Search_Lucene_Search_Query_Fuzzy::DEFAULT_MIN_SIMILARITY
            : $parameter;
    }

    /**
     * Rebuild the wildcard pattern from the raw term.
     *
     * Every literal chunk between '*' and '?' characters is run through the
     * default analyzer; the wildcard characters themselves are put back
     * unchanged between the analyzed chunks.
     *
     * @param string $encoding
     * @return string
     * @throws Zend_Search_Lucene_Search_QueryParserException
     */
    private function _buildWildcardPattern($encoding)
    {
        $asteriskSeparated = array();

        foreach (explode('*', $this->_term) as $asteriskChunk) {
            $questionSeparated = array();

            foreach (explode('?', $asteriskChunk) as $literalChunk) {
                $chunkTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($literalChunk, $encoding);

                if (count($chunkTokens) > 1) {
                    require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
                    throw new Zend_Search_Lucene_Search_QueryParserException('Wildcard search is supported only for non-multiple word terms');
                }

                // Zero or one token at this point; an empty chunk contributes ''.
                $chunkText = '';
                foreach ($chunkTokens as $chunkToken) {
                    $chunkText .= $chunkToken->getTermText();
                }
                $questionSeparated[] = $chunkText;
            }

            $asteriskSeparated[] = implode('?', $questionSeparated);
        }

        return implode('*', $asteriskSeparated);
    }

    /**
     * Transform entry to a subquery
     *
     * Depending on the term this produces a wildcard, insignificant, plain
     * term, fuzzy or multi-term query, each carrying this entry's boost
     * (the insignificant query is returned without a boost being set).
     *
     * @param string $encoding
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Search_QueryParserException
     */
    public function getQuery($encoding)
    {
        $containsWildcard = strpos($this->_term, '?') !== false
                         || strpos($this->_term, '*') !== false;

        if ($containsWildcard) {
            if ($this->_fuzzyQuery) {
                require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
                throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search is not supported for terms with wildcards.');
            }

            $wildcardPattern = $this->_buildWildcardPattern($encoding);
            $wildcardTerm    = new Zend_Search_Lucene_Index_Term($wildcardPattern, $this->_field);
            $wildcardQuery   = new Zend_Search_Lucene_Search_Query_Wildcard($wildcardTerm);
            $wildcardQuery->setBoost($this->_boost);

            return $wildcardQuery;
        }

        $tokens     = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_term, $encoding);
        $tokenCount = count($tokens);

        // The analyzer dropped everything: nothing to search for.
        if ($tokenCount == 0) {
            return new Zend_Search_Lucene_Search_Query_Insignificant();
        }

        if ($tokenCount == 1) {
            $indexTerm = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);

            if ($this->_fuzzyQuery) {
                $singleQuery = new Zend_Search_Lucene_Search_Query_Fuzzy($indexTerm, $this->_similarity);
            } else {
                $singleQuery = new Zend_Search_Lucene_Search_Query_Term($indexTerm);
            }
            $singleQuery->setBoost($this->_boost);

            return $singleQuery;
        }

        // More than one token from here on.
        if ($this->_fuzzyQuery) {
            require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
            throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search is supported only for non-multiple word terms');
        }

        $multiTermQuery = new Zend_Search_Lucene_Search_Query_MultiTerm();

        /**
         * @todo Process $token->getPositionIncrement() to support stemming, synonyms and other
         * analyzer design features
         */
        foreach ($tokens as $token) {
            $subTerm = new Zend_Search_Lucene_Index_Term($token->getTermText(), $this->_field);
            $multiTermQuery->addTerm($subTerm, true); // every sub-term is required
        }
        $multiTermQuery->setBoost($this->_boost);

        return $multiTermQuery;
    }
}
QueryEntry/Phrase.php 0000666 00000007712 15125175533 0010650 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Index_Term */
require_once 'Zend/Search/Lucene/Index/Term.php';
/** Zend_Search_Lucene_Search_QueryEntry */
require_once 'Zend/Search/Lucene/Search/QueryEntry.php';
/** Zend_Search_Lucene_Analysis_Analyzer */
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_QueryEntry_Phrase extends Zend_Search_Lucene_Search_QueryEntry
{
    /**
     * Raw phrase text, exactly as passed to the constructor
     *
     * @var string
     */
    private $_phrase;

    /**
     * Field the generated index terms are bound to
     *
     * @var string|null
     */
    private $_field;

    /**
     * Set to true once the '~' modifier has been applied to this entry
     *
     * @var boolean
     */
    private $_proximityQuery = false;

    /**
     * Word distance, passed as the slop of a proximity phrase query
     *
     * @var integer
     */
    private $_wordsDistance = 0;

    /**
     * Object constructor
     *
     * @param string $phrase
     * @param string $field
     */
    public function __construct($phrase, $field)
    {
        $this->_field  = $field;
        $this->_phrase = $phrase;
    }

    /**
     * Process modifier ('~')
     *
     * Marks the entry as a proximity query. The word distance is only
     * replaced when a parameter is actually given.
     *
     * @param mixed $parameter
     */
    public function processFuzzyProximityModifier($parameter = null)
    {
        $this->_proximityQuery = true;

        if ($parameter === null) {
            return;
        }

        $this->_wordsDistance = $parameter;
    }

    /**
     * Transform entry to a subquery
     *
     * Yields an insignificant query when analysis leaves no tokens, a plain
     * term query for exactly one token, and a phrase query otherwise.
     *
     * @param string $encoding
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Search_QueryParserException
     */
    public function getQuery($encoding)
    {
        $containsWildcard = strpos($this->_phrase, '?') !== false
                         || strpos($this->_phrase, '*') !== false;

        if ($containsWildcard) {
            require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
            throw new Zend_Search_Lucene_Search_QueryParserException('Wildcards are only allowed in a single terms.');
        }

        $tokens     = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_phrase, $encoding);
        $tokenCount = count($tokens);

        if ($tokenCount == 0) {
            return new Zend_Search_Lucene_Search_Query_Insignificant();
        }

        if ($tokenCount == 1) {
            $singleTerm  = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);
            $singleQuery = new Zend_Search_Lucene_Search_Query_Term($singleTerm);
            $singleQuery->setBoost($this->_boost);

            return $singleQuery;
        }

        // Two or more tokens: build a real phrase query. Positions are the
        // running sum of the tokens' position increments, starting from -1.
        $phraseQuery = new Zend_Search_Lucene_Search_Query_Phrase();
        $position    = -1;

        foreach ($tokens as $token) {
            $position += $token->getPositionIncrement();
            $phraseQuery->addTerm(
                new Zend_Search_Lucene_Index_Term($token->getTermText(), $this->_field),
                $position
            );
        }

        if ($this->_proximityQuery) {
            $phraseQuery->setSlop($this->_wordsDistance);
        }
        $phraseQuery->setBoost($this->_boost);

        return $phraseQuery;
    }
}
Lucene/Search/Weight/Term.php 0000666 00000006174 15125712134 0012050 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Weight */
require_once 'Zend/Search/Lucene/Search/Weight.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Weight_Term extends Zend_Search_Lucene_Search_Weight
{
    /**
     * Index reader the idf is computed against
     *
     * @var Zend_Search_Lucene_Interface
     */
    private $_reader;

    /**
     * Term being weighted
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_term;

    /**
     * Query supplying the boost
     *
     * @var Zend_Search_Lucene_Search_Query
     */
    private $_query;

    /**
     * Score factor; populated by sumOfSquaredWeights()
     *
     * @var float
     */
    private $_idf;

    /**
     * Query weight; populated by sumOfSquaredWeights(), scaled by normalize()
     *
     * @var float
     */
    private $_queryWeight;

    /**
     * Zend_Search_Lucene_Search_Weight_Term constructor
     *
     * @param Zend_Search_Lucene_Index_Term   $term
     * @param Zend_Search_Lucene_Search_Query $query
     * @param Zend_Search_Lucene_Interface    $reader  index reader
     */
    public function __construct(Zend_Search_Lucene_Index_Term   $term,
                                Zend_Search_Lucene_Search_Query $query,
                                Zend_Search_Lucene_Interface    $reader)
    {
        $this->_term   = $term;
        $this->_query  = $query;
        $this->_reader = $reader;
    }

    /**
     * The sum of squared weights of contained query clauses.
     *
     * Stores the idf and the query weight (idf * boost) as a side effect,
     * then returns the square of that weight.
     *
     * @return float
     */
    public function sumOfSquaredWeights()
    {
        $similarity = $this->_reader->getSimilarity();

        $this->_idf         = $similarity->idf($this->_term, $this->_reader);
        $this->_queryWeight = $this->_idf * $this->_query->getBoost();

        $weight = $this->_queryWeight;

        return $weight * $weight;
    }

    /**
     * Assigns the query normalization factor to this.
     *
     * Relies on the idf and query weight stored by sumOfSquaredWeights().
     *
     * @param float $queryNorm
     */
    public function normalize($queryNorm)
    {
        $this->_queryNorm = $queryNorm;

        // Scale the query weight by the normalization factor ...
        $this->_queryWeight = $this->_queryWeight * $queryNorm;

        // ... and fold the idf in once more to get the final value.
        $this->_value = $this->_queryWeight * $this->_idf;
    }
}
Lucene/Search/Weight/Empty.php 0000666 00000003010 15125712134 0012221 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Weight */
require_once 'Zend/Search/Lucene/Search/Weight.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Weight_Empty extends Zend_Search_Lucene_Search_Weight
{
/**
* The sum of squared weights of contained query clauses.
*
* Always yields the constant 1; no index or query state is consulted.
* Note that the literal returned is the integer 1.
*
* @return float
*/
public function sumOfSquaredWeights()
{
return 1;
}
/**
* Assigns the query normalization factor to this.
*
* No-op: the body is empty, so $queryNorm is ignored.
*
* @param float $queryNorm
*/
public function normalize($queryNorm)
{
}
}
Lucene/Search/Weight/MultiTerm.php 0000666 00000007157 15125712134 0013065 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Weight */
require_once 'Zend/Search/Lucene/Search/Weight.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Weight_MultiTerm extends Zend_Search_Lucene_Search_Weight
{
    /**
     * Index reader passed on to every per-term weight
     *
     * @var Zend_Search_Lucene_Interface
     */
    private $_reader;

    /**
     * The query that this concerns.
     *
     * @var Zend_Search_Lucene_Search_Query
     */
    private $_query;

    /**
     * Per-term weights, keyed by the term id used in the query
     * Array of Zend_Search_Lucene_Search_Weight_Term
     *
     * @var array
     */
    private $_weights;

    /**
     * Zend_Search_Lucene_Search_Weight_MultiTerm constructor
     *
     * Creates a term weight for every term whose sign is not a non-null
     * falsy value and registers that weight on the query.
     *
     * @param Zend_Search_Lucene_Search_Query $query   the query that this concerns
     * @param Zend_Search_Lucene_Interface    $reader  index reader
     */
    public function __construct(Zend_Search_Lucene_Search_Query $query,
                                Zend_Search_Lucene_Interface    $reader)
    {
        $this->_query   = $query;
        $this->_reader  = $reader;
        $this->_weights = array();

        $signs = $query->getSigns();

        foreach ($query->getTerms() as $id => $term) {
            // Skip only terms whose sign is set and evaluates to false.
            $excluded = $signs !== null && $signs[$id] !== null && !$signs[$id];
            if ($excluded) {
                continue;
            }

            $termWeight          = new Zend_Search_Lucene_Search_Weight_Term($term, $query, $reader);
            $this->_weights[$id] = $termWeight;
            $query->setWeight($id, $termWeight);
        }
    }

    /**
     * The weight for this query
     * Standard Weight::$_value is not used for boolean queries
     *
     * @return float
     */
    public function getValue()
    {
        $boost = $this->_query->getBoost();

        return $boost;
    }

    /**
     * The sum of squared weights of contained query clauses.
     *
     * @return float
     */
    public function sumOfSquaredWeights()
    {
        $total = 0;
        foreach ($this->_weights as $termWeight) {
            $total += $termWeight->sumOfSquaredWeights();
        }

        // Apply the query boost, squared, to the accumulated sub-weights.
        $boostSquared = $this->_query->getBoost() * $this->_query->getBoost();
        $total *= $boostSquared;

        // A zero sum (e.g. a query like '-something -another') is reported as 1.0.
        return ($total == 0) ? 1.0 : $total;
    }

    /**
     * Assigns the query normalization factor to this.
     *
     * The factor is multiplied by the query boost and then forwarded to
     * every per-term weight.
     *
     * @param float $queryNorm
     */
    public function normalize($queryNorm)
    {
        $boostedNorm = $queryNorm * $this->_query->getBoost();

        foreach ($this->_weights as $termWeight) {
            $termWeight->normalize($boostedNorm);
        }
    }
}
Lucene/Search/Weight/Boolean.php 0000666 00000007017 15125712134 0012515 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Weight */
require_once 'Zend/Search/Lucene/Search/Weight.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Weight_Boolean extends Zend_Search_Lucene_Search_Weight
{
    /**
     * Index reader passed on to every subquery weight
     *
     * @var Zend_Search_Lucene_Interface
     */
    private $_reader;

    /**
     * The query that this concerns.
     *
     * @var Zend_Search_Lucene_Search_Query
     */
    private $_query;

    /**
     * Subquery weights, keyed by the subquery number used in the query
     * Array of Zend_Search_Lucene_Search_Weight
     *
     * @var array
     */
    private $_weights;

    /**
     * Zend_Search_Lucene_Search_Weight_Boolean constructor
     *
     * Creates a weight for every subquery whose sign is not a non-null
     * falsy value.
     *
     * @param Zend_Search_Lucene_Search_Query $query   the query that this concerns
     * @param Zend_Search_Lucene_Interface    $reader  index reader
     */
    public function __construct(Zend_Search_Lucene_Search_Query $query,
                                Zend_Search_Lucene_Interface    $reader)
    {
        $this->_query   = $query;
        $this->_reader  = $reader;
        $this->_weights = array();

        $signs = $query->getSigns();

        foreach ($query->getSubqueries() as $num => $subquery) {
            // Skip only subqueries whose sign is set and evaluates to false.
            $excluded = $signs !== null && $signs[$num] !== null && !$signs[$num];
            if ($excluded) {
                continue;
            }

            $this->_weights[$num] = $subquery->createWeight($reader);
        }
    }

    /**
     * The weight for this query
     * Standard Weight::$_value is not used for boolean queries
     *
     * @return float
     */
    public function getValue()
    {
        $boost = $this->_query->getBoost();

        return $boost;
    }

    /**
     * The sum of squared weights of contained query clauses.
     *
     * @return float
     */
    public function sumOfSquaredWeights()
    {
        $total = 0;
        foreach ($this->_weights as $subqueryWeight) {
            $total += $subqueryWeight->sumOfSquaredWeights();
        }

        // Apply the query boost, squared, to the accumulated sub-weights.
        $boostSquared = $this->_query->getBoost() * $this->_query->getBoost();
        $total *= $boostSquared;

        // A zero sum (e.g. a query like '-something -another') is reported as 1.0.
        return ($total == 0) ? 1.0 : $total;
    }

    /**
     * Assigns the query normalization factor to this.
     *
     * The factor is multiplied by the query boost and then forwarded to
     * every subquery weight.
     *
     * @param float $queryNorm
     */
    public function normalize($queryNorm)
    {
        $boostedNorm = $queryNorm * $this->_query->getBoost();

        foreach ($this->_weights as $subqueryWeight) {
            $subqueryWeight->normalize($boostedNorm);
        }
    }
}
Lucene/Search/Weight/Phrase.php 0000666 00000005457 15125712134 0012366 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* Zend_Search_Lucene_Search_Weight
*/
require_once 'Zend/Search/Lucene/Search/Weight.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Weight_Phrase extends Zend_Search_Lucene_Search_Weight
{
    /**
     * Index reader the idf is computed against
     *
     * @var Zend_Search_Lucene_Interface
     */
    private $_reader;

    /**
     * The query that this concerns.
     *
     * @var Zend_Search_Lucene_Search_Query_Phrase
     */
    private $_query;

    /**
     * Score factor; populated by sumOfSquaredWeights()
     *
     * @var float
     */
    private $_idf;

    /**
     * Query weight; populated by sumOfSquaredWeights(), scaled by normalize()
     *
     * Declared explicitly so it is no longer created on the fly as a
     * dynamic property (deprecated as of PHP 8.2).
     *
     * @var float
     */
    private $_queryWeight;

    /**
     * Zend_Search_Lucene_Search_Weight_Phrase constructor
     *
     * @param Zend_Search_Lucene_Search_Query_Phrase $query
     * @param Zend_Search_Lucene_Interface           $reader
     */
    public function __construct(Zend_Search_Lucene_Search_Query_Phrase $query,
                                Zend_Search_Lucene_Interface           $reader)
    {
        $this->_query  = $query;
        $this->_reader = $reader;
    }

    /**
     * The sum of squared weights of contained query clauses.
     *
     * Stores the idf (computed from all terms of the phrase) and the query
     * weight (idf * boost) as a side effect, then returns the square of
     * that weight.
     *
     * @return float
     */
    public function sumOfSquaredWeights()
    {
        // compute idf
        $this->_idf = $this->_reader->getSimilarity()->idf($this->_query->getTerms(), $this->_reader);

        // compute query weight
        $this->_queryWeight = $this->_idf * $this->_query->getBoost();

        // square it
        return $this->_queryWeight * $this->_queryWeight;
    }

    /**
     * Assigns the query normalization factor to this.
     *
     * Relies on the idf and query weight stored by sumOfSquaredWeights().
     *
     * @param float $queryNorm
     */
    public function normalize($queryNorm)
    {
        $this->_queryNorm = $queryNorm;

        // normalize query weight
        $this->_queryWeight *= $queryNorm;

        // idf for documents
        $this->_value = $this->_queryWeight * $this->_idf;
    }
}
Lucene/Search/Similarity/Default.php 0000666 00000005312 15125712134 0013415 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Similarity */
require_once 'Zend/Search/Lucene/Search/Similarity.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Similarity_Default extends Zend_Search_Lucene_Search_Similarity
{
    /**
     * Implemented as '1/sqrt(numTerms)'.
     *
     * A term count of zero yields 1E10 instead of dividing by zero.
     * $fieldName is not used by this implementation.
     *
     * @param string  $fieldName
     * @param integer $numTerms
     * @return float
     */
    public function lengthNorm($fieldName, $numTerms)
    {
        return ($numTerms == 0) ? 1E10 : 1.0 / sqrt($numTerms);
    }

    /**
     * Implemented as '1/sqrt(sumOfSquaredWeights)'.
     *
     * @param float $sumOfSquaredWeights
     * @return float
     */
    public function queryNorm($sumOfSquaredWeights)
    {
        $root = sqrt($sumOfSquaredWeights);

        return 1.0 / $root;
    }

    /**
     * Implemented as 'sqrt(freq)'.
     *
     * @param float $freq
     * @return float
     */
    public function tf($freq)
    {
        $root = sqrt($freq);

        return $root;
    }

    /**
     * Implemented as '1/(distance + 1)'.
     *
     * @param integer $distance
     * @return float
     */
    public function sloppyFreq($distance)
    {
        $divisor = $distance + 1;

        return 1.0 / $divisor;
    }

    /**
     * Implemented as 'log(numDocs/(docFreq+1)) + 1'.
     *
     * @param integer $docFreq
     * @param integer $numDocs
     * @return float
     */
    public function idfFreq($docFreq, $numDocs)
    {
        // Cast the divisor to float so the ratio is a floating-point value.
        $ratio = $numDocs / (float) ($docFreq + 1);

        return log($ratio) + 1.0;
    }

    /**
     * Implemented as 'overlap/maxOverlap'.
     *
     * @param integer $overlap
     * @param integer $maxOverlap
     * @return float
     */
    public function coord($overlap, $maxOverlap)
    {
        $divisor = (float) $maxOverlap;

        return $overlap / $divisor;
    }
}
Lucene/Search/QueryParserContext.php 0000666 00000033033 15125712134 0013533 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_FSM */
require_once 'Zend/Search/Lucene/FSM.php';
/** Zend_Search_Lucene_Index_Term */
require_once 'Zend/Search/Lucene/Index/Term.php';
/** Zend_Search_Lucene_Search_QueryToken */
require_once 'Zend/Search/Lucene/Search/QueryToken.php';
/** Zend_Search_Lucene_Search_Query_Term */
require_once 'Zend/Search/Lucene/Search/Query/Term.php';
/** Zend_Search_Lucene_Search_Query_MultiTerm */
require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
/** Zend_Search_Lucene_Search_Query_Boolean */
require_once 'Zend/Search/Lucene/Search/Query/Boolean.php';
/** Zend_Search_Lucene_Search_Query_Phrase */
require_once 'Zend/Search/Lucene/Search/Query/Phrase.php';
/** Zend_Search_Lucene_Search_BooleanExpressionRecognizer */
require_once 'Zend/Search/Lucene/Search/BooleanExpressionRecognizer.php';
/** Zend_Search_Lucene_Search_QueryEntry */
require_once 'Zend/Search/Lucene/Search/QueryEntry.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_QueryParserContext
{
/**
* Default field for the context.
*
* null means, that term should be searched through all fields
* Zend_Search_Lucene_Search_Query::rewriteQuery($index) transletes such queries to several
*
* @var string|null
*/
private $_defaultField;
/**
* Field specified for next entry
*
* @var string
*/
private $_nextEntryField = null;
/**
* True means, that term is required.
* False means, that term is prohibited.
* null means, that term is neither prohibited, nor required
*
* @var boolean
*/
private $_nextEntrySign = null;
/**
* Entries grouping mode
*/
const GM_SIGNS = 0; // Signs mode: '+term1 term2 -term3 +(subquery1) -(subquery2)'
const GM_BOOLEAN = 1; // Boolean operators mode: 'term1 and term2 or (subquery1) and not (subquery2)'
/**
* Grouping mode
*
* @var integer
*/
private $_mode = null;
/**
* Entries signs.
* Used in GM_SIGNS grouping mode
*
* @var arrays
*/
private $_signs = array();
/**
* Query entries
* Each entry is a Zend_Search_Lucene_Search_QueryEntry object or
* boolean operator (Zend_Search_Lucene_Search_QueryToken class constant)
*
* @var array
*/
private $_entries = array();
/**
* Query string encoding
*
* @var string
*/
private $_encoding;
/**
* Context object constructor
*
* @param string $encoding
* @param string|null $defaultField
*/
public function __construct($encoding, $defaultField = null)
{
$this->_encoding = $encoding;
$this->_defaultField = $defaultField;
}
/**
* Get context default field
*
* @return string|null
*/
public function getField()
{
return ($this->_nextEntryField !== null) ? $this->_nextEntryField : $this->_defaultField;
}
/**
* Set field for next entry
*
* @param string $field
*/
public function setNextEntryField($field)
{
$this->_nextEntryField = $field;
}
/**
* Set sign for next entry
*
* @param integer $sign
* @throws Zend_Search_Lucene_Exception
*/
public function setNextEntrySign($sign)
{
if ($this->_mode === self::GM_BOOLEAN) {
require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
throw new Zend_Search_Lucene_Search_QueryParserException('It\'s not allowed to mix boolean and signs styles in the same subquery.');
}
$this->_mode = self::GM_SIGNS;
if ($sign == Zend_Search_Lucene_Search_QueryToken::TT_REQUIRED) {
$this->_nextEntrySign = true;
} else if ($sign == Zend_Search_Lucene_Search_QueryToken::TT_PROHIBITED) {
$this->_nextEntrySign = false;
} else {
require_once 'Zend/Search/Lucene/Exception.php';
throw new Zend_Search_Lucene_Exception('Unrecognized sign type.');
}
}
/**
* Add entry to a query
*
* @param Zend_Search_Lucene_Search_QueryEntry $entry
*/
public function addEntry(Zend_Search_Lucene_Search_QueryEntry $entry)
{
if ($this->_mode !== self::GM_BOOLEAN) {
$this->_signs[] = $this->_nextEntrySign;
}
$this->_entries[] = $entry;
$this->_nextEntryField = null;
$this->_nextEntrySign = null;
}
/**
* Process fuzzy search or proximity search modifier
*
* @throws Zend_Search_Lucene_Search_QueryParserException
*/
public function processFuzzyProximityModifier($parameter = null)
{
// Check, that modifier has came just after word or phrase
if ($this->_nextEntryField !== null || $this->_nextEntrySign !== null) {
require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
throw new Zend_Search_Lucene_Search_QueryParserException('\'~\' modifier must follow word or phrase.');
}
$lastEntry = array_pop($this->_entries);
if (!$lastEntry instanceof Zend_Search_Lucene_Search_QueryEntry) {
// there are no entries or last entry is boolean operator
require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
throw new Zend_Search_Lucene_Search_QueryParserException('\'~\' modifier must follow word or phrase.');
}
$lastEntry->processFuzzyProximityModifier($parameter);
$this->_entries[] = $lastEntry;
}
/**
* Set boost factor to the entry
*
* @param float $boostFactor
*/
public function boost($boostFactor)
{
// Check, that modifier has came just after word or phrase
if ($this->_nextEntryField !== null || $this->_nextEntrySign !== null) {
require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
throw new Zend_Search_Lucene_Search_QueryParserException('\'^\' modifier must follow word, phrase or subquery.');
}
$lastEntry = array_pop($this->_entries);
if (!$lastEntry instanceof Zend_Search_Lucene_Search_QueryEntry) {
// there are no entries or last entry is boolean operator
require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
throw new Zend_Search_Lucene_Search_QueryParserException('\'^\' modifier must follow word, phrase or subquery.');
}
$lastEntry->boost($boostFactor);
$this->_entries[] = $lastEntry;
}
/**
* Process logical operator
*
* @param integer $operator
*/
public function addLogicalOperator($operator)
{
if ($this->_mode === self::GM_SIGNS) {
require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
throw new Zend_Search_Lucene_Search_QueryParserException('It\'s not allowed to mix boolean and signs styles in the same subquery.');
}
$this->_mode = self::GM_BOOLEAN;
$this->_entries[] = $operator;
}
/**
* Generate 'signs style' query from the context
* '+term1 term2 -term3 +(<subquery1>) ...'
*
* @return Zend_Search_Lucene_Search_Query
*/
public function _signStyleExpressionQuery()
{
$query = new Zend_Search_Lucene_Search_Query_Boolean();
if (Zend_Search_Lucene_Search_QueryParser::getDefaultOperator() == Zend_Search_Lucene_Search_QueryParser::B_AND) {
$defaultSign = true; // required
} else {
// Zend_Search_Lucene_Search_QueryParser::B_OR
$defaultSign = null; // optional
}
foreach ($this->_entries as $entryId => $entry) {
$sign = ($this->_signs[$entryId] !== null) ? $this->_signs[$entryId] : $defaultSign;
$query->addSubquery($entry->getQuery($this->_encoding), $sign);
}
return $query;
}
/**
* Generate 'boolean style' query from the context
* 'term1 and term2 or term3 and (<subquery1>) and not (<subquery2>)'
*
* @return Zend_Search_Lucene_Search_Query
* @throws Zend_Search_Lucene
*/
private function _booleanExpressionQuery()
{
/**
* We treat each level of an expression as a boolean expression in
* a Disjunctive Normal Form
*
* AND operator has higher precedence than OR
*
* Thus logical query is a disjunction of one or more conjunctions of
* one or more query entries
*/
$expressionRecognizer = new Zend_Search_Lucene_Search_BooleanExpressionRecognizer();
require_once 'Zend/Search/Lucene/Exception.php';
try {
foreach ($this->_entries as $entry) {
if ($entry instanceof Zend_Search_Lucene_Search_QueryEntry) {
$expressionRecognizer->processLiteral($entry);
} else {
switch ($entry) {
case Zend_Search_Lucene_Search_QueryToken::TT_AND_LEXEME:
$expressionRecognizer->processOperator(Zend_Search_Lucene_Search_BooleanExpressionRecognizer::IN_AND_OPERATOR);
break;
case Zend_Search_Lucene_Search_QueryToken::TT_OR_LEXEME:
$expressionRecognizer->processOperator(Zend_Search_Lucene_Search_BooleanExpressionRecognizer::IN_OR_OPERATOR);
break;
case Zend_Search_Lucene_Search_QueryToken::TT_NOT_LEXEME:
$expressionRecognizer->processOperator(Zend_Search_Lucene_Search_BooleanExpressionRecognizer::IN_NOT_OPERATOR);
break;
default:
throw new Zend_Search_Lucene('Boolean expression error. Unknown operator type.');
}
}
}
$conjuctions = $expressionRecognizer->finishExpression();
} catch (Zend_Search_Exception $e) {
// throw new Zend_Search_Lucene_Search_QueryParserException('Boolean expression error. Error message: \'' .
// $e->getMessage() . '\'.' );
// It's query syntax error message and it should be user friendly. So FSM message is omitted
require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
throw new Zend_Search_Lucene_Search_QueryParserException('Boolean expression error.');
}
// Remove 'only negative' conjunctions
foreach ($conjuctions as $conjuctionId => $conjuction) {
$nonNegativeEntryFound = false;
foreach ($conjuction as $conjuctionEntry) {
if ($conjuctionEntry[1]) {
$nonNegativeEntryFound = true;
break;
}
}
if (!$nonNegativeEntryFound) {
unset($conjuctions[$conjuctionId]);
}
}
$subqueries = array();
foreach ($conjuctions as $conjuction) {
// Check, if it's a one term conjuction
if (count($conjuction) == 1) {
$subqueries[] = $conjuction[0][0]->getQuery($this->_encoding);
} else {
$subquery = new Zend_Search_Lucene_Search_Query_Boolean();
foreach ($conjuction as $conjuctionEntry) {
$subquery->addSubquery($conjuctionEntry[0]->getQuery($this->_encoding), $conjuctionEntry[1]);
}
$subqueries[] = $subquery;
}
}
if (count($subqueries) == 0) {
return new Zend_Search_Lucene_Search_Query_Insignificant();
}
if (count($subqueries) == 1) {
return $subqueries[0];
}
$query = new Zend_Search_Lucene_Search_Query_Boolean();
foreach ($subqueries as $subquery) {
// Non-requirered entry/subquery
$query->addSubquery($subquery);
}
return $query;
}
/**
 * Build the query object for the current context.
 *
 * The context mode selects the builder: boolean-expression mode is handled
 * by _booleanExpressionQuery(), every other mode by
 * _signStyleExpressionQuery().
 *
 * @return Zend_Search_Lucene_Search_Query
 */
public function getQuery()
{
    return ($this->_mode === self::GM_BOOLEAN)
        ? $this->_booleanExpressionQuery()
        : $this->_signStyleExpressionQuery();
}
}
Lucene/Search/QueryParserException.php 0000666 00000002363 15125712134 0014047 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* Zend_Search_Lucene base exception
*/
require_once 'Zend/Search/Lucene/Exception.php';
/**
 * Query parser exception.
 *
 * Special exception type, which may be used to intercept wrong user input
 * (query syntax errors) separately from other Zend_Search_Lucene exceptions.
 * It adds no members of its own to Zend_Search_Lucene_Exception.
 *
 * @category Zend
 * @package Zend_Search_Lucene
 * @subpackage Search
 * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 * @license http://framework.zend.com/license/new-bsd New BSD License
 */
class Zend_Search_Lucene_Search_QueryParserException extends Zend_Search_Lucene_Exception
{}
Lucene/Search/Query/Empty.php 0000666 00000007334 15125712134 0012114 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/** Zend_Search_Lucene_Search_Weight_Empty */
require_once 'Zend/Search/Lucene/Search/Weight/Empty.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_Empty extends Zend_Search_Lucene_Search_Query
{
    /**
     * Rewrite the query into primitive queries for the given index.
     *
     * There is nothing to rewrite here, so the same instance is returned.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    public function rewrite(Zend_Search_Lucene_Interface $index)
    {
        return $this;
    }

    /**
     * Optimize the query for the given index.
     *
     * The "Empty" query is already primitive, so it is returned unchanged.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    public function optimize(Zend_Search_Lucene_Interface $index)
    {
        return $this;
    }

    /**
     * Create the Weight object that belongs to this query.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @return Zend_Search_Lucene_Search_Weight
     */
    public function createWeight(Zend_Search_Lucene_Interface $reader)
    {
        return new Zend_Search_Lucene_Search_Weight_Empty();
    }

    /**
     * Execute the query against the index reader.
     *
     * Intentionally a no-op: neither argument is used.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
     */
    public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
    {
        // Nothing to execute
    }

    /**
     * Get the ids of documents matching the query.
     *
     * The result is keyed by document id; for this query it is always empty.
     *
     * @return array
     */
    public function matchedDocs()
    {
        return array();
    }

    /**
     * Score the specified document.
     *
     * Always 0, whatever document is asked for.
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     */
    public function score($docId, Zend_Search_Lucene_Interface $reader)
    {
        return 0;
    }

    /**
     * Return the terms of the query (always an empty list).
     *
     * @return array
     */
    public function getQueryTerms()
    {
        return array();
    }

    /**
     * Highlight query terms in the document.
     *
     * Intentionally a no-op: $doc and $colorIndex are left untouched.
     *
     * @param Zend_Search_Lucene_Document_Html $doc
     * @param integer &$colorIndex
     */
    public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
    {
        // Nothing to highlight
    }

    /**
     * Printable representation of the query.
     *
     * @return string
     */
    public function __toString()
    {
        return '<EmptyQuery>';
    }
}
Lucene/Search/Query/MultiTerm.php 0000666 00000047532 15125712134 0012744 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/** Zend_Search_Lucene_Search_Weight_MultiTerm */
require_once 'Zend/Search/Lucene/Search/Weight/MultiTerm.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_MultiTerm extends Zend_Search_Lucene_Search_Query
{
    /**
     * Terms to find.
     * Array of Zend_Search_Lucene_Index_Term
     *
     * @var array
     */
    private $_terms = array();

    /**
     * Term signs, indexed by the same keys as $_terms.
     * If true then term is required.
     * If false then term is prohibited.
     * If null then term is neither prohibited, nor required
     *
     * If the whole array is null then all terms are required
     *
     * @var array|null
     */
    private $_signs;

    /**
     * Result vector (document ids as keys).
     *
     * @var array
     */
    private $_resVector = null;

    /**
     * Terms frequencies vectors.
     * Array of Arrays:
     * term1Id => (docId => freq, ...)
     * term2Id => (docId => freq, ...)
     *
     * @var array
     */
    private $_termsFreqs = array();

    /**
     * A score factor based on the fraction of all query terms
     * that a document contains.
     * float for conjunction queries
     * array of float for non conjunction queries
     *
     * @var mixed
     */
    private $_coord = null;

    /**
     * Terms weights
     * array of Zend_Search_Lucene_Search_Weight
     *
     * @var array
     */
    private $_weights = array();

    /**
     * Class constructor. Create a new multi-term query object.
     *
     * If the $signs array is omitted then all terms are required.
     * It differs from addTerm() behavior, but should never be used.
     *
     * A terms-per-query limit of 0 is treated as "no limit", matching the
     * check performed by Zend_Search_Lucene_Search_Query_Range::rewrite().
     *
     * @param array $terms Array of Zend_Search_Lucene_Index_Term objects
     * @param array $signs Array of signs. Sign is boolean|null.
     * @throws Zend_Search_Lucene_Exception if the number of terms exceeds the limit
     */
    public function __construct($terms = null, $signs = null)
    {
        if (is_array($terms)) {
            $maxTerms = Zend_Search_Lucene::getTermsPerQueryLimit();
            if ($maxTerms != 0 && count($terms) > $maxTerms) {
                throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
            }

            $this->_terms = $terms;
            $this->_signs = null;

            // Keep the signs only if at least one term is not required;
            // a null $_signs is the compact form of "all terms are required"
            if (is_array($signs)) {
                foreach ($signs as $sign) {
                    if ($sign !== true) {
                        $this->_signs = $signs;
                        break;
                    }
                }
            }
        }
    }

    /**
     * Add a $term (Zend_Search_Lucene_Index_Term) to this query.
     *
     * The sign is specified as:
     * TRUE - term is required
     * FALSE - term is prohibited
     * NULL - term is neither prohibited, nor required
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @param boolean|null $sign
     * @return void
     */
    public function addTerm(Zend_Search_Lucene_Index_Term $term, $sign = null)
    {
        if ($sign !== true || $this->_signs !== null) { // Skip, if all terms are required
            if ($this->_signs === null) { // Check, If all previous terms are required
                // Expand the compact "all required" form into an explicit list
                $this->_signs = array();
                foreach ($this->_terms as $prevTerm) {
                    $this->_signs[] = true;
                }
            }
            $this->_signs[] = $sign;
        }

        $this->_terms[] = $term;
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * Returns an Empty query when there are no terms, $this when every term
     * has a field, and otherwise a Boolean query built from rewritten
     * single-term subqueries.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    public function rewrite(Zend_Search_Lucene_Interface $index)
    {
        if (count($this->_terms) == 0) {
            return new Zend_Search_Lucene_Search_Query_Empty();
        }

        // Check, that all fields are qualified
        $allQualified = true;
        foreach ($this->_terms as $term) {
            if ($term->field === null) {
                $allQualified = false;
                break;
            }
        }

        if ($allQualified) {
            return $this;
        } else {
            /** transform multiterm query to boolean and apply rewrite() method to subqueries. */
            $query = new Zend_Search_Lucene_Search_Query_Boolean();
            $query->setBoost($this->getBoost());

            foreach ($this->_terms as $termId => $term) {
                $subquery = new Zend_Search_Lucene_Search_Query_Term($term);

                $query->addSubquery($subquery->rewrite($index),
                                    ($this->_signs === null)? true : $this->_signs[$termId]);
            }

            return $query;
        }
    }

    /**
     * Optimize query in the context of specified index
     *
     * Terms missing from the index make the whole query Empty when they are
     * required, and are dropped otherwise. The remaining terms are returned
     * as an Empty, Term or MultiTerm query carrying this query's boost.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    public function optimize(Zend_Search_Lucene_Interface $index)
    {
        $terms = $this->_terms;
        $signs = $this->_signs;

        foreach ($terms as $id => $term) {
            if (!$index->hasTerm($term)) {
                if ($signs === null || $signs[$id] === true) {
                    // Term is required
                    return new Zend_Search_Lucene_Search_Query_Empty();
                } else {
                    // Term is optional or prohibited
                    // Remove it from terms and signs list
                    unset($terms[$id]);
                    unset($signs[$id]);
                }
            }
        }

        // Check if all presented terms are prohibited
        $allProhibited = true;
        if ($signs === null) {
            $allProhibited = false;
        } else {
            foreach ($signs as $sign) {
                if ($sign !== false) {
                    $allProhibited = false;
                    break;
                }
            }
        }
        if ($allProhibited) {
            return new Zend_Search_Lucene_Search_Query_Empty();
        }

        /**
         * @todo make an optimization for repeated terms
         * (they may have different signs)
         */

        if (count($terms) == 1) {
            // It's already checked, that it's not a prohibited term

            // It's one term query with one required or optional element
            $optimizedQuery = new Zend_Search_Lucene_Search_Query_Term(reset($terms));
            $optimizedQuery->setBoost($this->getBoost());

            return $optimizedQuery;
        }

        if (count($terms) == 0) {
            return new Zend_Search_Lucene_Search_Query_Empty();
        }

        $optimizedQuery = new Zend_Search_Lucene_Search_Query_MultiTerm($terms, $signs);
        $optimizedQuery->setBoost($this->getBoost());
        return $optimizedQuery;
    }

    /**
     * Returns query terms
     *
     * @return array
     */
    public function getTerms()
    {
        return $this->_terms;
    }

    /**
     * Return terms signs
     *
     * @return array|null null means that all terms are required
     */
    public function getSigns()
    {
        return $this->_signs;
    }

    /**
     * Set weight for specified term
     *
     * @param integer $num
     * @param Zend_Search_Lucene_Search_Weight_Term $weight
     */
    public function setWeight($num, $weight)
    {
        $this->_weights[$num] = $weight;
    }

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * The created weight is also stored in $this->_weight.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @return Zend_Search_Lucene_Search_Weight
     */
    public function createWeight(Zend_Search_Lucene_Interface $reader)
    {
        $this->_weight = new Zend_Search_Lucene_Search_Weight_MultiTerm($this, $reader);
        return $this->_weight;
    }

    /**
     * Calculate result vector for Conjunction query
     * (like '+something +another')
     *
     * Note: the terms list is re-ordered (and re-indexed) by document
     * frequency as a side effect.
     *
     * @param Zend_Search_Lucene_Interface $reader
     */
    private function _calculateConjunctionResult(Zend_Search_Lucene_Interface $reader)
    {
        $this->_resVector = null;

        if (count($this->_terms) == 0) {
            // Nothing to intersect. Return here: without any term there is
            // no docs vector to flip into a result set below.
            $this->_resVector = array();
            return;
        }

        // Order terms by selectivity
        $docFreqs = array();
        $ids = array();
        foreach ($this->_terms as $id => $term) {
            $docFreqs[] = $reader->docFreq($term);
            $ids[] = $id; // Used to keep original order for terms with the same selectivity and omit terms comparison
        }
        array_multisort($docFreqs, SORT_ASC, SORT_NUMERIC,
                        $ids, SORT_ASC, SORT_NUMERIC,
                        $this->_terms);

        $docsFilter = new Zend_Search_Lucene_Index_DocsFilter();
        foreach ($this->_terms as $termId => $term) {
            $termDocs = $reader->termDocs($term, $docsFilter);
        }
        // Treat last retrieved docs vector as a result set
        // (filter collects data for other terms)
        $this->_resVector = array_flip($termDocs);

        foreach ($this->_terms as $termId => $term) {
            $this->_termsFreqs[$termId] = $reader->termFreqs($term, $docsFilter);
        }

        // ksort($this->_resVector, SORT_NUMERIC);
        // Docs are returned ordered. Used algorithms doesn't change elements order.
    }

    /**
     * Calculate result vector for non Conjunction query
     * (like '+something -another')
     *
     * The result is the intersection of all required terms' documents (or
     * the union of optional terms' documents when nothing is required),
     * minus the documents of prohibited terms, sorted by document id.
     *
     * @param Zend_Search_Lucene_Interface $reader
     */
    private function _calculateNonConjunctionResult(Zend_Search_Lucene_Interface $reader)
    {
        $requiredVectors = array();
        $requiredVectorsSizes = array();
        $requiredVectorsIds = array(); // is used to prevent arrays comparison

        $optional = array();
        $prohibited = array();

        foreach ($this->_terms as $termId => $term) {
            $termDocs = array_flip($reader->termDocs($term));

            if ($this->_signs[$termId] === true) {
                // required
                $requiredVectors[] = $termDocs;
                $requiredVectorsSizes[] = count($termDocs);
                $requiredVectorsIds[] = $termId;
            } elseif ($this->_signs[$termId] === false) {
                // prohibited
                // array union
                $prohibited += $termDocs;
            } else {
                // neither required, nor prohibited
                // array union
                $optional += $termDocs;
            }

            $this->_termsFreqs[$termId] = $reader->termFreqs($term);
        }

        // sort resvectors in order of subquery cardinality increasing
        array_multisort($requiredVectorsSizes, SORT_ASC, SORT_NUMERIC,
                        $requiredVectorsIds, SORT_ASC, SORT_NUMERIC,
                        $requiredVectors);

        $required = null;
        foreach ($requiredVectors as $nextResVector) {
            if ($required === null) {
                $required = $nextResVector;
            } else {
                //$required = array_intersect_key($required, $nextResVector);

                /**
                 * This code is used as workaround for array_intersect_key() slowness problem.
                 */
                $updatedVector = array();
                foreach ($required as $id => $value) {
                    if (isset($nextResVector[$id])) {
                        $updatedVector[$id] = $value;
                    }
                }
                $required = $updatedVector;
            }

            if (count($required) == 0) {
                // Empty result set, we don't need to check other terms
                break;
            }
        }

        if ($required !== null) {
            $this->_resVector = $required;
        } else {
            $this->_resVector = $optional;
        }

        if (count($prohibited) != 0) {
            // $this->_resVector = array_diff_key($this->_resVector, $prohibited);

            /**
             * This code is used as workaround for array_diff_key() slowness problem.
             * It iterates over the smaller of the two arrays.
             */
            if (count($this->_resVector) < count($prohibited)) {
                $updatedVector = $this->_resVector;
                foreach ($this->_resVector as $id => $value) {
                    if (isset($prohibited[$id])) {
                        unset($updatedVector[$id]);
                    }
                }
                $this->_resVector = $updatedVector;
            } else {
                $updatedVector = $this->_resVector;
                foreach ($prohibited as $id => $value) {
                    unset($updatedVector[$id]);
                }
                $this->_resVector = $updatedVector;
            }
        }

        ksort($this->_resVector, SORT_NUMERIC);
    }

    /**
     * Score calculator for conjunction queries (all terms are required)
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     */
    public function _conjunctionScore($docId, Zend_Search_Lucene_Interface $reader)
    {
        if ($this->_coord === null) {
            // Computed once and cached: every term is matched in this mode
            $this->_coord = $reader->getSimilarity()->coord(count($this->_terms),
                                                            count($this->_terms) );
        }

        $score = 0.0;

        foreach ($this->_terms as $termId => $term) {
            /**
             * We don't need to check that term freq is not 0
             * Score calculation is performed only for matched docs
             */
            $score += $reader->getSimilarity()->tf($this->_termsFreqs[$termId][$docId]) *
                      $this->_weights[$termId]->getValue() *
                      $reader->norm($docId, $term->field);
        }

        return $score * $this->_coord * $this->getBoost();
    }

    /**
     * Score calculator for non conjunction queries (not all terms are required)
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     */
    public function _nonConjunctionScore($docId, $reader)
    {
        if ($this->_coord === null) {
            // Build and cache the coord factor for every possible number of
            // matched non-prohibited terms (0..$maxCoord)
            $this->_coord = array();

            $maxCoord = 0;
            foreach ($this->_signs as $sign) {
                if ($sign !== false /* not prohibited */) {
                    $maxCoord++;
                }
            }

            for ($count = 0; $count <= $maxCoord; $count++) {
                $this->_coord[$count] = $reader->getSimilarity()->coord($count, $maxCoord);
            }
        }

        $score = 0.0;
        $matchedTerms = 0;
        foreach ($this->_terms as $termId => $term) {
            // Check if term is
            if ($this->_signs[$termId] !== false &&        // not prohibited
                isset($this->_termsFreqs[$termId][$docId]) // matched
               ) {
                $matchedTerms++;

                /**
                 * We don't need to check that term freq is not 0
                 * Score calculation is performed only for matched docs
                 */
                $score +=
                      $reader->getSimilarity()->tf($this->_termsFreqs[$termId][$docId]) *
                      $this->_weights[$termId]->getValue() *
                      $reader->norm($docId, $term->field);
            }
        }

        return $score * $this->_coord[$matchedTerms] * $this->getBoost();
    }

    /**
     * Execute query in context of index reader
     * It also initializes necessary internal structures
     *
     * $docsFilter is accepted for interface compatibility; this
     * implementation does not use it.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
     */
    public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
    {
        if ($this->_signs === null) {
            $this->_calculateConjunctionResult($reader);
        } else {
            $this->_calculateNonConjunctionResult($reader);
        }

        // Initialize weight if it's not done yet
        $this->_initWeight($reader);
    }

    /**
     * Get document ids likely matching the query
     *
     * It's an array with document ids as keys (performance considerations)
     *
     * @return array
     */
    public function matchedDocs()
    {
        return $this->_resVector;
    }

    /**
     * Score specified document
     *
     * Documents that are not in the result vector score 0.
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     */
    public function score($docId, Zend_Search_Lucene_Interface $reader)
    {
        if (isset($this->_resVector[$docId])) {
            if ($this->_signs === null) {
                return $this->_conjunctionScore($docId, $reader);
            } else {
                return $this->_nonConjunctionScore($docId, $reader);
            }
        } else {
            return 0;
        }
    }

    /**
     * Return query terms
     *
     * Prohibited terms are left out.
     *
     * @return array
     */
    public function getQueryTerms()
    {
        if ($this->_signs === null) {
            return $this->_terms;
        }

        $terms = array();

        foreach ($this->_signs as $id => $sign) {
            if ($sign !== false) {
                $terms[] = $this->_terms[$id];
            }
        }

        return $terms;
    }

    /**
     * Highlight query terms
     *
     * The text of every non-prohibited term is highlighted with one color.
     *
     * @param Zend_Search_Lucene_Document_Html $doc
     * @param integer &$colorIndex
     */
    public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
    {
        $words = array();

        if ($this->_signs === null) {
            foreach ($this->_terms as $term) {
                $words[] = $term->text;
            }
        } else {
            foreach ($this->_signs as $id => $sign) {
                if ($sign !== false) {
                    $words[] = $this->_terms[$id]->text;
                }
            }
        }

        $doc->highlight($words, $this->_getHighlightColor($colorIndex));
    }

    /**
     * Print a query
     *
     * @return string
     */
    public function __toString()
    {
        // It's used only for query visualisation, so we don't care about characters escaping

        $query = '';

        // Track the position explicitly instead of testing the key against 0:
        // keys are not guaranteed to start at 0 (optimize() unsets entries and
        // passes the remaining terms on with their original keys)
        $isFirst = true;

        foreach ($this->_terms as $id => $term) {
            if (!$isFirst) {
                $query .= ' ';
            }
            $isFirst = false;

            if ($this->_signs === null || $this->_signs[$id] === true) {
                $query .= '+';
            } else if ($this->_signs[$id] === false) {
                $query .= '-';
            }

            if ($term->field !== null) {
                $query .= $term->field . ':';
            }
            $query .= $term->text;
        }

        if ($this->getBoost() != 1) {
            $query = '(' . $query . ')^' . $this->getBoost();
        }

        return $query;
    }
}
Lucene/Search/Query/Range.php 0000666 00000024544 15125712134 0012054 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/** Zend_Search_Lucene_Search_Query_MultiTerm */
require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_Range extends Zend_Search_Lucene_Search_Query
{
    /**
     * Lower term.
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_lowerTerm;

    /**
     * Upper term.
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_upperTerm;

    /**
     * Search field
     *
     * @var string
     */
    private $_field;

    /**
     * Inclusive
     *
     * @var boolean
     */
    private $_inclusive;

    /**
     * Matched terms.
     *
     * Matched terms list.
     * It's filled during the search (rewrite operation) and may be used for search result
     * post-processing
     *
     * Array of Zend_Search_Lucene_Index_Term objects.
     * Stays null until rewrite() has been called.
     *
     * @var array
     */
    private $_matches;

    /**
     * Zend_Search_Lucene_Search_Query_Range constructor.
     *
     * The search field is taken from the lower term when it is given,
     * otherwise from the upper term.
     *
     * @param Zend_Search_Lucene_Index_Term|null $lowerTerm
     * @param Zend_Search_Lucene_Index_Term|null $upperTerm
     * @param boolean $inclusive
     * @throws Zend_Search_Lucene_Exception if both terms are null or their fields differ
     */
    public function __construct($lowerTerm, $upperTerm, $inclusive)
    {
        if ($lowerTerm === null && $upperTerm === null) {
            throw new Zend_Search_Lucene_Exception('At least one term must be non-null');
        }
        if ($lowerTerm !== null && $upperTerm !== null && $lowerTerm->field != $upperTerm->field) {
            throw new Zend_Search_Lucene_Exception('Both terms must be for the same field');
        }

        $this->_field = ($lowerTerm !== null)? $lowerTerm->field : $upperTerm->field;
        $this->_lowerTerm = $lowerTerm;
        $this->_upperTerm = $upperTerm;
        $this->_inclusive = $inclusive;
    }

    /**
     * Get query field name
     *
     * @return string|null
     */
    public function getField()
    {
        return $this->_field;
    }

    /**
     * Get lower term
     *
     * @return Zend_Search_Lucene_Index_Term|null
     */
    public function getLowerTerm()
    {
        return $this->_lowerTerm;
    }

    /**
     * Get upper term
     *
     * @return Zend_Search_Lucene_Index_Term|null
     */
    public function getUpperTerm()
    {
        return $this->_upperTerm;
    }

    /**
     * Check whether the range boundaries are included in the range
     *
     * @return boolean
     */
    public function isInclusive()
    {
        return $this->_inclusive;
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * Walks the index terms stream for each searched field, collects the
     * terms that fall into the range into the matched terms list and returns
     * an Empty, Term or MultiTerm query built from them.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Exception if the terms per query limit is exceeded
     */
    public function rewrite(Zend_Search_Lucene_Interface $index)
    {
        $this->_matches = array();

        if ($this->_field === null) {
            // Search through all fields
            $fields = $index->getFieldNames(true /* indexed fields list */);
        } else {
            $fields = array($this->_field);
        }

        // A limit of 0 disables the check below
        $maxTerms = Zend_Search_Lucene::getTermsPerQueryLimit();
        foreach ($fields as $field) {
            $index->resetTermsStream();

            if ($this->_lowerTerm !== null) {
                $lowerTerm = new Zend_Search_Lucene_Index_Term($this->_lowerTerm->text, $field);

                $index->skipTo($lowerTerm);

                if (!$this->_inclusive &&
                    $index->currentTerm() == $lowerTerm) {
                    // Skip lower term
                    $index->nextTerm();
                }
            } else {
                $index->skipTo(new Zend_Search_Lucene_Index_Term('', $field));
            }

            if ($this->_upperTerm !== null) {
                // Walk up to the upper term
                $upperTerm = new Zend_Search_Lucene_Index_Term($this->_upperTerm->text, $field);

                while ($index->currentTerm() !== null &&
                       $index->currentTerm()->field == $field &&
                       $index->currentTerm()->text < $upperTerm->text) {
                    $this->_matches[] = $index->currentTerm();

                    if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                        throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
                    }

                    $index->nextTerm();
                }

                if ($this->_inclusive && $index->currentTerm() == $upperTerm) {
                    // Include upper term into result
                    $this->_matches[] = $upperTerm;
                }
            } else {
                // Walk up to the end of field data
                while ($index->currentTerm() !== null && $index->currentTerm()->field == $field) {
                    $this->_matches[] = $index->currentTerm();

                    if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                        throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
                    }

                    $index->nextTerm();
                }
            }

            $index->closeTermsStream();
        }

        if (count($this->_matches) == 0) {
            return new Zend_Search_Lucene_Search_Query_Empty();
        } else if (count($this->_matches) == 1) {
            return new Zend_Search_Lucene_Search_Query_Term(reset($this->_matches));
        } else {
            $rewrittenQuery = new Zend_Search_Lucene_Search_Query_MultiTerm();

            foreach ($this->_matches as $matchedTerm) {
                $rewrittenQuery->addTerm($matchedTerm);
            }

            return $rewrittenQuery;
        }
    }

    /**
     * Optimize query in the context of specified index
     *
     * Not supported: a range query has to be rewritten first.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Exception always
     */
    public function optimize(Zend_Search_Lucene_Interface $index)
    {
        throw new Zend_Search_Lucene_Exception('Range query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Return query terms
     *
     * @return array
     * @throws Zend_Search_Lucene_Exception if rewrite() has not been performed yet
     */
    public function getQueryTerms()
    {
        if ($this->_matches === null) {
            throw new Zend_Search_Lucene_Exception('Search has to be performed first to get matched terms');
        }

        return $this->_matches;
    }

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * Not supported: a range query has to be rewritten first.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @return Zend_Search_Lucene_Search_Weight
     * @throws Zend_Search_Lucene_Exception always
     */
    public function createWeight(Zend_Search_Lucene_Interface $reader)
    {
        throw new Zend_Search_Lucene_Exception('Range query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Execute query in context of index reader
     * It also initializes necessary internal structures
     *
     * Not supported: a range query has to be rewritten first.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
     * @throws Zend_Search_Lucene_Exception always
     */
    public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
    {
        throw new Zend_Search_Lucene_Exception('Range query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Get document ids likely matching the query
     *
     * It's an array with document ids as keys (performance considerations)
     *
     * Not supported: a range query has to be rewritten first.
     *
     * @return array
     * @throws Zend_Search_Lucene_Exception always
     */
    public function matchedDocs()
    {
        throw new Zend_Search_Lucene_Exception('Range query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Score specified document
     *
     * Not supported: a range query has to be rewritten first.
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     * @throws Zend_Search_Lucene_Exception always
     */
    public function score($docId, Zend_Search_Lucene_Interface $reader)
    {
        throw new Zend_Search_Lucene_Exception('Range query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Highlight query terms
     *
     * Highlights the text of the terms matched by the last rewrite().
     * If rewrite() has not been called yet there are no matched terms and
     * an empty word list is passed to the document.
     *
     * @param Zend_Search_Lucene_Document_Html $doc
     * @param integer &$colorIndex
     */
    public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
    {
        $words = array();

        // $_matches is null until rewrite() runs; do not iterate over null
        if ($this->_matches !== null) {
            foreach ($this->_matches as $term) {
                $words[] = $term->text;
            }
        }

        $doc->highlight($words, $this->_getHighlightColor($colorIndex));
    }

    /**
     * Print a query
     *
     * @return string
     */
    public function __toString()
    {
        // It's used only for query visualisation, so we don't care about characters escaping
        return (($this->_field === null)? '' : $this->_field . ':')
             . (($this->_inclusive)? '[' : '{')
             . (($this->_lowerTerm !== null)? $this->_lowerTerm->text : 'null')
             . ' TO '
             . (($this->_upperTerm !== null)? $this->_upperTerm->text : 'null')
             . (($this->_inclusive)? ']' : '}');
    }
}
Lucene/Search/Query/Term.php 0000666 00000014063 15125712134 0011722 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/** Zend_Search_Lucene_Search_Weight_Term */
require_once 'Zend/Search/Lucene/Search/Weight/Term.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_Term extends Zend_Search_Lucene_Search_Query
{
/**
* Term to find.
*
* @var Zend_Search_Lucene_Index_Term
*/
private $_term;
/**
* Documents vector.
*
* @var array
*/
private $_docVector = null;
/**
* Term freqs vector.
* array(docId => freq, ...)
*
* @var array
*/
private $_termFreqs;
/**
* Zend_Search_Lucene_Search_Query_Term constructor
*
* @param Zend_Search_Lucene_Index_Term $term
* @param boolean $sign
*/
public function __construct(Zend_Search_Lucene_Index_Term $term)
{
$this->_term = $term;
}
/**
* Re-write query into primitive queries in the context of specified index
*
* @param Zend_Search_Lucene_Interface $index
* @return Zend_Search_Lucene_Search_Query
*/
public function rewrite(Zend_Search_Lucene_Interface $index)
{
if ($this->_term->field != null) {
return $this;
} else {
$query = new Zend_Search_Lucene_Search_Query_MultiTerm();
$query->setBoost($this->getBoost());
foreach ($index->getFieldNames(true) as $fieldName) {
$term = new Zend_Search_Lucene_Index_Term($this->_term->text, $fieldName);
$query->addTerm($term);
}
return $query->rewrite($index);
}
}
/**
* Optimize query in the context of specified index
*
* @param Zend_Search_Lucene_Interface $index
* @return Zend_Search_Lucene_Search_Query
*/
public function optimize(Zend_Search_Lucene_Interface $index)
{
// Check, that index contains specified term
if (!$index->hasTerm($this->_term)) {
return new Zend_Search_Lucene_Search_Query_Empty();
}
return $this;
}
/**
* Constructs an appropriate Weight implementation for this query.
*
* @param Zend_Search_Lucene_Interface $reader
* @return Zend_Search_Lucene_Search_Weight
*/
public function createWeight(Zend_Search_Lucene_Interface $reader)
{
$this->_weight = new Zend_Search_Lucene_Search_Weight_Term($this->_term, $this, $reader);
return $this->_weight;
}
/**
* Execute query in context of index reader
* It also initializes necessary internal structures
*
* @param Zend_Search_Lucene_Interface $reader
* @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
*/
public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
{
$this->_docVector = array_flip($reader->termDocs($this->_term, $docsFilter));
$this->_termFreqs = $reader->termFreqs($this->_term, $docsFilter);
// Initialize weight if it's not done yet
$this->_initWeight($reader);
}
/**
* Get document ids likely matching the query
*
* It's an array with document ids as keys (performance considerations)
*
* @return array
*/
public function matchedDocs()
{
// Document ids are the array keys (the vector is built with array_flip() in execute()).
return $this->_docVector;
}
/**
* Score specified document
*
* @param integer $docId
* @param Zend_Search_Lucene_Interface $reader
* @return float
*/
public function score($docId, Zend_Search_Lucene_Interface $reader)
{
    // Documents that execute() did not collect contribute nothing.
    if (!isset($this->_docVector[$docId])) {
        return 0;
    }

    // score = tf(term frequency) * query weight * field norm * boost
    $tf     = $reader->getSimilarity()->tf($this->_termFreqs[$docId]);
    $weight = $this->_weight->getValue();
    $norm   = $reader->norm($docId, $this->_term->field);

    return $tf * $weight * $norm * $this->getBoost();
}
/**
* Return query terms
*
* @return array
*/
public function getQueryTerms()
{
// A term query has exactly one term; wrap it so the return value is an array.
return array($this->_term);
}
/**
* Return query term
*
* @return Zend_Search_Lucene_Index_Term
*/
public function getTerm()
{
// Expose the term object passed to the constructor.
return $this->_term;
}
/**
* Returns query term
*
* @return array
*/
public function getTerms()
{
    // This query holds a single term in $_term; the previously returned
    // $_terms property is never assigned by this class, so callers received
    // null (plus an "undefined property" notice) instead of the documented
    // array. Return the term wrapped in an array, like getQueryTerms().
    return array($this->_term);
}
/**
* Highlight query terms
*
* @param Zend_Search_Lucene_Document_Html $doc
* @param integer &$colorIndex
*/
public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
{
    // Obtain the highlight colour for this query ($colorIndex is passed by
    // reference to the helper), then highlight the term text in the document.
    $highlightColor = $this->_getHighlightColor($colorIndex);

    $doc->highlight($this->_term->text, $highlightColor);
}
/**
* Print a query
*
* @return string
*/
public function __toString()
{
    // It's used only for query visualisation, so we don't care about characters escaping
    $printed = $this->_term->text;

    // Prepend "field:" only when the term is bound to a field.
    if ($this->_term->field !== null) {
        $printed = $this->_term->field . ':' . $printed;
    }

    return $printed;
}
}
Lucene/Search/Query/Phrase.php 0000666 00000043011 15125712134 0012230 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* Zend_Search_Lucene_Search_Query
*/
require_once 'Zend/Search/Lucene/Search/Query.php';
/**
* Zend_Search_Lucene_Search_Weight_Phrase
*/
require_once 'Zend/Search/Lucene/Search/Weight/Phrase.php';
/**
* A Query that matches documents containing a particular sequence of terms.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_Phrase extends Zend_Search_Lucene_Search_Query
{
    /**
     * Terms to find.
     * Array of Zend_Search_Lucene_Index_Term objects.
     *
     * Keys are whatever the constructor caller supplied (or sequential
     * integers when terms are added through addTerm()), so code must not
     * assume that index 0 exists.
     *
     * @var array
     */
    private $_terms;

    /**
     * Term positions (relative positions of terms within the phrase).
     * Array of integers, indexed by the same keys as $_terms.
     *
     * @var array
     */
    private $_offsets;

    /**
     * Sets the number of other words permitted between words in query phrase.
     * If zero, then this is an exact phrase search.  For larger values this works
     * like a WITHIN or NEAR operator.
     *
     * The slop is in fact an edit-distance, where the units correspond to
     * moves of terms in the query phrase out of position.  For example, to switch
     * the order of two words requires two moves (the first move places the words
     * atop one another), so to permit re-orderings of phrases, the slop must be
     * at least two.
     * More exact matches are scored higher than sloppier matches, thus search
     * results are sorted by exactness.
     *
     * The slop is zero by default, requiring exact matches.
     *
     * @var integer
     */
    private $_slop;

    /**
     * Result vector: document ids (as keys) that contain every phrase term.
     * Filled by execute().
     *
     * @var array
     */
    private $_resVector = null;

    /**
     * Terms positions vectors.
     * Array of Arrays:
     * term1Id => (docId => array( pos1, pos2, ... ), ...)
     * term2Id => (docId => array( pos1, pos2, ... ), ...)
     *
     * @var array
     */
    private $_termsPositions = array();

    /**
     * Class constructor.  Create a new phrase query.
     *
     * @param array  $terms    Terms to search. Array of strings, or null for an empty phrase.
     * @param array  $offsets  Relative term positions. Array of integers, or null to number
     *                         the terms 0, 1, 2, ... in iteration order.
     * @param string $field    Field to search, or null for a field-less phrase.
     * @throws Zend_Search_Lucene_Exception
     */
    public function __construct($terms = null, $offsets = null, $field = null)
    {
        $this->_slop = 0;

        if (is_array($terms)) {
            $this->_terms = array();
            // Caller-supplied keys are preserved on purpose: $offsets is matched by key.
            foreach ($terms as $termId => $termText) {
                $this->_terms[$termId] = ($field !== null)? new Zend_Search_Lucene_Index_Term($termText, $field):
                                                            new Zend_Search_Lucene_Index_Term($termText);
            }
        } else if ($terms === null) {
            $this->_terms = array();
        } else {
            throw new Zend_Search_Lucene_Exception('terms argument must be array of strings or null');
        }

        if (is_array($offsets)) {
            if (count($this->_terms) != count($offsets)) {
                throw new Zend_Search_Lucene_Exception('terms and offsets arguments must have the same size.');
            }
            $this->_offsets = $offsets;
        } else if ($offsets === null) {
            // Default offsets: consecutive positions in term iteration order.
            $this->_offsets = array();
            foreach ($this->_terms as $termId => $term) {
                $position = count($this->_offsets);
                $this->_offsets[$termId] = $position;
            }
        } else {
            // Offsets are integer positions; the message previously said "strings".
            throw new Zend_Search_Lucene_Exception('offsets argument must be array of integers or null');
        }
    }

    /**
     * Set slop
     *
     * @param integer $slop
     */
    public function setSlop($slop)
    {
        $this->_slop = $slop;
    }

    /**
     * Get slop
     *
     * @return integer
     */
    public function getSlop()
    {
        return $this->_slop;
    }

    /**
     * Adds a term to the end of the query phrase.
     * The relative position of the term is specified explicitly or the one immediately
     * after the last term added.
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @param integer $position
     * @throws Zend_Search_Lucene_Exception  if the term's field differs from the last added term's field
     */
    public function addTerm(Zend_Search_Lucene_Index_Term $term, $position = null) {
        if ((count($this->_terms) != 0)&&(end($this->_terms)->field != $term->field)) {
            throw new Zend_Search_Lucene_Exception('All phrase terms must be in the same field: ' .
                                                   $term->field . ':' . $term->text);
        }

        $this->_terms[] = $term;

        if ($position !== null) {
            $this->_offsets[] = $position;
        } else if (count($this->_offsets) != 0) {
            // No explicit position: place the term right after the previous one.
            $this->_offsets[] = end($this->_offsets) + 1;
        } else {
            $this->_offsets[] = 0;
        }
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * An empty phrase becomes the empty query, a phrase bound to a field is
     * returned as is, and a field-less phrase is expanded into a boolean
     * query with one phrase subquery per field from getFieldNames(true).
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    public function rewrite(Zend_Search_Lucene_Interface $index)
    {
        if (count($this->_terms) == 0) {
            return new Zend_Search_Lucene_Search_Query_Empty();
        }

        // Inspect the first term with reset() rather than $_terms[0]: the
        // constructor keeps caller-supplied keys, so key 0 may not exist.
        // addTerm() rejects terms whose field differs from the preceding one.
        if (reset($this->_terms)->field !== null) {
            return $this;
        }

        $query = new Zend_Search_Lucene_Search_Query_Boolean();
        $query->setBoost($this->getBoost());

        foreach ($index->getFieldNames(true) as $fieldName) {
            $subquery = new Zend_Search_Lucene_Search_Query_Phrase();
            $subquery->setSlop($this->getSlop());

            foreach ($this->_terms as $termId => $term) {
                $qualifiedTerm = new Zend_Search_Lucene_Index_Term($term->text, $fieldName);

                $subquery->addTerm($qualifiedTerm, $this->_offsets[$termId]);
            }

            $query->addSubquery($subquery);
        }

        return $query;
    }

    /**
     * Optimize query in the context of specified index
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    public function optimize(Zend_Search_Lucene_Interface $index)
    {
        // Check, that index contains all phrase terms
        foreach ($this->_terms as $term) {
            if (!$index->hasTerm($term)) {
                return new Zend_Search_Lucene_Search_Query_Empty();
            }
        }

        if (count($this->_terms) == 1) {
            // It's one term query
            $optimizedQuery = new Zend_Search_Lucene_Search_Query_Term(reset($this->_terms));
            $optimizedQuery->setBoost($this->getBoost());

            return $optimizedQuery;
        }

        if (count($this->_terms) == 0) {
            return new Zend_Search_Lucene_Search_Query_Empty();
        }

        return $this;
    }

    /**
     * Returns query terms
     *
     * @return array
     */
    public function getTerms()
    {
        return $this->_terms;
    }

    /**
     * Set weight for specified term
     *
     * @param integer $num
     * @param Zend_Search_Lucene_Search_Weight_Term $weight
     */
    public function setWeight($num, $weight)
    {
        $this->_weights[$num] = $weight;
    }

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @return Zend_Search_Lucene_Search_Weight
     */
    public function createWeight(Zend_Search_Lucene_Interface $reader)
    {
        $this->_weight = new Zend_Search_Lucene_Search_Weight_Phrase($this, $reader);
        return $this->_weight;
    }

    /**
     * Score calculator for exact phrase queries (terms sequence is fixed)
     *
     * Counts how many positions of the rarest term (within the document) have
     * every other term at exactly the expected relative offset.
     * Reads the position vectors collected by execute().
     *
     * @param integer $docId
     * @return float
     */
    public function _exactPhraseFreq($docId)
    {
        $freq = 0;

        // Term Id with lowest cardinality
        $lowCardTermId = null;

        // Calculate $lowCardTermId
        foreach ($this->_terms as $termId => $term) {
            if ($lowCardTermId === null ||
                count($this->_termsPositions[$termId][$docId]) <
                count($this->_termsPositions[$lowCardTermId][$docId]) ) {
                $lowCardTermId = $termId;
            }
        }

        // Walk through positions of the term with lowest cardinality
        foreach ($this->_termsPositions[$lowCardTermId][$docId] as $lowCardPos) {
            // We expect phrase to be found
            $freq++;

            // Walk through other terms
            foreach ($this->_terms as $termId => $term) {
                if ($termId != $lowCardTermId) {
                    $expectedPosition = $lowCardPos +
                                            ($this->_offsets[$termId] -
                                             $this->_offsets[$lowCardTermId]);

                    if (!in_array($expectedPosition, $this->_termsPositions[$termId][$docId])) {
                        $freq--;  // Phrase wasn't found.
                        break;
                    }
                }
            }
        }

        return $freq;
    }

    /**
     * Score calculator for sloppy phrase queries (terms sequence is fixed)
     *
     * Builds candidate phrases (one position per term) from the position
     * vectors collected by execute(), then adds sloppyFreq(distance) for each
     * candidate whose smallest total displacement is within the slop.
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     */
    public function _sloppyPhraseFreq($docId, Zend_Search_Lucene_Interface $reader)
    {
        $freq = 0;

        $phraseQueue = array();
        $phraseQueue[0] = array(); // empty phrase
        $lastTerm = null;

        // Walk through the terms to create phrases.
        foreach ($this->_terms as $termId => $term) {
            // Only candidates that existed before this term are extended.
            $queueSize = count($phraseQueue);
            $firstPass = true;

            // Walk through the term positions.
            // Each term position produces a set of phrases.
            foreach ($this->_termsPositions[$termId][$docId] as $termPosition ) {
                if ($firstPass) {
                    // The first position is written into the existing candidates in place.
                    for ($count = 0; $count < $queueSize; $count++) {
                        $phraseQueue[$count][$termId] = $termPosition;
                    }
                } else {
                    // Further positions fork the existing candidates, skipping forks
                    // whose gap to the previous term already exceeds the slop.
                    for ($count = 0; $count < $queueSize; $count++) {
                        if ($lastTerm !== null &&
                            abs( $termPosition - $phraseQueue[$count][$lastTerm] -
                                 ($this->_offsets[$termId] - $this->_offsets[$lastTerm])) > $this->_slop) {
                            continue;
                        }

                        $newPhraseId = count($phraseQueue);
                        $phraseQueue[$newPhraseId]          = $phraseQueue[$count];
                        $phraseQueue[$newPhraseId][$termId] = $termPosition;
                    }
                }

                $firstPass = false;
            }
            $lastTerm = $termId;
        }

        foreach ($phraseQueue as $phrasePos) {
            $minDistance = null;

            // Try every phrase start within +/- slop of the first term and keep
            // the smallest summed displacement.
            for ($shift = -$this->_slop; $shift <= $this->_slop; $shift++) {
                $distance = 0;
                $start = reset($phrasePos) - reset($this->_offsets) + $shift;

                foreach ($this->_terms as $termId => $term) {
                    $distance += abs($phrasePos[$termId] - $this->_offsets[$termId] - $start);

                    if($distance > $this->_slop) {
                        break;
                    }
                }

                if ($minDistance === null || $distance < $minDistance) {
                    $minDistance = $distance;
                }
            }

            if ($minDistance <= $this->_slop) {
                $freq += $reader->getSimilarity()->sloppyFreq($minDistance);
            }
        }

        return $freq;
    }

    /**
     * Execute query in context of index reader
     * It also initializes necessary internal structures
     *
     * NOTE(review): $docsFilter is accepted but not forwarded to termDocs() /
     * termPositions() here - confirm whether that is intentional.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
     */
    public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
    {
        $this->_resVector = null;

        if (count($this->_terms) == 0) {
            $this->_resVector = array();
        }

        $resVectors      = array();
        $resVectorsSizes = array();
        $resVectorsIds   = array(); // is used to prevent arrays comparison
        foreach ($this->_terms as $termId => $term) {
            $resVectors[]      = array_flip($reader->termDocs($term));
            $resVectorsSizes[] = count(end($resVectors));
            $resVectorsIds[]   = $termId;

            $this->_termsPositions[$termId] = $reader->termPositions($term);
        }
        // sort resvectors in order of subquery cardinality increasing
        array_multisort($resVectorsSizes, SORT_ASC, SORT_NUMERIC,
                        $resVectorsIds,   SORT_ASC, SORT_NUMERIC,
                        $resVectors);

        foreach ($resVectors as $nextResVector) {
            if($this->_resVector === null) {
                $this->_resVector = $nextResVector;
            } else {
                // Manual key intersection; used as a workaround for the
                // array_intersect_key() slowness problem.
                $updatedVector = array();
                foreach ($this->_resVector as $id => $value) {
                    if (isset($nextResVector[$id])) {
                        $updatedVector[$id] = $value;
                    }
                }
                $this->_resVector = $updatedVector;
            }

            if (count($this->_resVector) == 0) {
                // Empty result set, we don't need to check other terms
                break;
            }
        }

        // No ksort() is needed here:
        // docs are returned ordered. Used algorithm doesn't change elements order.

        // Initialize weight if it's not done yet
        $this->_initWeight($reader);
    }

    /**
     * Get document ids likely matching the query
     *
     * It's an array with document ids as keys (performance considerations)
     *
     * @return array
     */
    public function matchedDocs()
    {
        return $this->_resVector;
    }

    /**
     * Score specified document
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     */
    public function score($docId, Zend_Search_Lucene_Interface $reader)
    {
        if (isset($this->_resVector[$docId])) {
            if ($this->_slop == 0) {
                $freq = $this->_exactPhraseFreq($docId);
            } else {
                $freq = $this->_sloppyPhraseFreq($docId, $reader);
            }

            if ($freq != 0) {
                $tf = $reader->getSimilarity()->tf($freq);
                $weight = $this->_weight->getValue();
                $norm = $reader->norm($docId, reset($this->_terms)->field);

                return $tf * $weight * $norm * $this->getBoost();
            }

            // Included in result, but calculated freq is zero
            return 0;
        } else {
            return 0;
        }
    }

    /**
     * Return query terms
     *
     * @return array
     */
    public function getQueryTerms()
    {
        return $this->_terms;
    }

    /**
     * Highlight query terms
     *
     * @param Zend_Search_Lucene_Document_Html $doc
     * @param integer &$colorIndex
     */
    public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
    {
        $words = array();
        foreach ($this->_terms as $term) {
            $words[] = $term->text;
        }

        $doc->highlight($words, $this->_getHighlightColor($colorIndex));
    }

    /**
     * Print a query
     *
     * Format: [field:]"text1 text2 ..."[~slop]
     *
     * @return string
     */
    public function __toString()
    {
        // It's used only for query visualisation, so we don't care about characters escaping
        $query = '';

        // reset() returns false for an empty phrase; do not rely on key 0
        // existing, because the constructor keeps caller-supplied keys.
        $firstTerm = reset($this->_terms);
        if ($firstTerm !== false && $firstTerm->field !== null) {
            $query .= $firstTerm->field . ':';
        }

        // Join by position in the list rather than by comparing keys with 0,
        // so the separators are correct for any key set.
        $words = array();
        foreach ($this->_terms as $term) {
            $words[] = $term->text;
        }
        $query .= '"' . implode(' ', $words) . '"';

        if ($this->_slop != 0) {
            $query .= '~' . $this->_slop;
        }

        return $query;
    }
}
Lucene/Search/Query/Fuzzy.php 0000666 00000037233 15125712134 0012146 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/** Zend_Search_Lucene_Search_Query_MultiTerm */
require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_Fuzzy extends Zend_Search_Lucene_Search_Query
{
    /** Default minimum similarity */
    const DEFAULT_MIN_SIMILARITY = 0.5;

    /**
     * Maximum number of matched terms.
     * Apache Lucene defines this limitation as boolean query maximum number of clauses:
     * org.apache.lucene.search.BooleanQuery.getMaxClauseCount()
     */
    const MAX_CLAUSE_COUNT = 1024;

    /**
     * Array of precalculated max distances
     *
     * keys are integers representing a word size
     *
     * @var array
     */
    private $_maxDistances = array();

    /**
     * Base searching term.
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_term;

    /**
     * A value between 0 and 1 to set the required similarity
     *  between the query term and the matching terms. For example, for a
     *  _minimumSimilarity of 0.5 a term of the same length
     *  as the query term is considered similar to the query term if the edit distance
     *  between both terms is less than length(term)*0.5
     *
     * @var float
     */
    private $_minimumSimilarity;

    /**
     * The length of common (non-fuzzy) prefix
     *
     * @var integer
     */
    private $_prefixLength;

    /**
     * Matched terms.
     *
     * Matched terms list.
     * It's filled during the search (rewrite operation) and may be used for search result
     * post-processing
     *
     * Array of Zend_Search_Lucene_Index_Term objects
     *
     * @var array
     */
    private $_matches = null;

    /**
     * Matched terms scores
     *
     * @var array
     */
    private $_scores = null;

    /**
     * Array of the term keys.
     * Used to sort terms in alphabetical order if terms have the same scores
     *
     * @var array
     */
    private $_termKeys = null;

    /**
     * Default non-fuzzy prefix length
     *
     * @var integer
     */
    private static $_defaultPrefixLength = 3;

    /**
     * Zend_Search_Lucene_Search_Query_Fuzzy constructor.
     *
     * @param Zend_Search_Lucene_Index_Term $term               Base term to search around
     * @param float                         $minimumSimilarity  Required similarity, 0 <= value < 1
     * @param integer|null                  $prefixLength       Non-fuzzy prefix length; null selects
     *                                                          the class-wide default
     * @throws Zend_Search_Lucene_Exception
     */
    public function __construct(Zend_Search_Lucene_Index_Term $term, $minimumSimilarity = self::DEFAULT_MIN_SIMILARITY, $prefixLength = null)
    {
        if ($minimumSimilarity < 0) {
            throw new Zend_Search_Lucene_Exception('minimumSimilarity cannot be less than 0');
        }
        if ($minimumSimilarity >= 1) {
            throw new Zend_Search_Lucene_Exception('minimumSimilarity cannot be greater than or equal to 1');
        }
        if ($prefixLength < 0) {
            throw new Zend_Search_Lucene_Exception('prefixLength cannot be less than 0');
        }

        $this->_term              = $term;
        $this->_minimumSimilarity = $minimumSimilarity;
        $this->_prefixLength      = ($prefixLength !== null)? $prefixLength : self::$_defaultPrefixLength;
    }

    /**
     * Get default non-fuzzy prefix length
     *
     * @return integer
     */
    public static function getDefaultPrefixLength()
    {
        return self::$_defaultPrefixLength;
    }

    /**
     * Set default non-fuzzy prefix length
     *
     * @param integer $defaultPrefixLength
     */
    public static function setDefaultPrefixLength($defaultPrefixLength)
    {
        self::$_defaultPrefixLength = $defaultPrefixLength;
    }

    /**
     * Calculate maximum distance for specified word length
     *
     * The result is also cached in $_maxDistances under $length.
     *
     * @param integer $prefixLength  length of the common prefix
     * @param integer $termLength    length of the query term remainder (after the prefix)
     * @param integer $length        length of the candidate term remainder; used as cache key
     * @return integer
     */
    private function _calculateMaxDistance($prefixLength, $termLength, $length)
    {
        $this->_maxDistances[$length] = (int) ((1 - $this->_minimumSimilarity)*(min($termLength, $length) + $prefixLength));
        return $this->_maxDistances[$length];
    }

    /**
     * Record an index term as a match if it is similar enough.
     *
     * @param Zend_Search_Lucene_Index_Term $term         candidate index term
     * @param float                         $similarity   similarity computed for the candidate
     * @param float                         $scaleFactor  1/(1 - minimum similarity), used to scale the score
     * @param integer                       $maxTerms     terms per query limit, 0 disables the check
     * @throws Zend_Search_Lucene_Exception  when the terms per query limit is exceeded
     */
    private function _registerMatch($term, $similarity, $scaleFactor, $maxTerms)
    {
        if ($similarity > $this->_minimumSimilarity) {
            $this->_matches[]  = $term;
            $this->_termKeys[] = $term->key();
            $this->_scores[]   = ($similarity - $this->_minimumSimilarity)*$scaleFactor;

            if ($maxTerms != 0  &&  count($this->_matches) > $maxTerms) {
                throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
            }
        }
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * Scans the index terms stream of every searched field, collects the terms
     * whose similarity to the query term exceeds the minimum similarity, and
     * returns an empty query (no matches), a term query (one match) or a
     * boolean query of boosted term queries (several matches).
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Exception
     */
    public function rewrite(Zend_Search_Lucene_Interface $index)
    {
        $this->_matches  = array();
        $this->_scores   = array();
        $this->_termKeys = array();

        if ($this->_term->field === null) {
            // Search through all fields
            $fields = $index->getFieldNames(true /* indexed fields list */);
        } else {
            $fields = array($this->_term->field);
        }

        $prefix           = Zend_Search_Lucene_Index_Term::getPrefix($this->_term->text, $this->_prefixLength);
        $prefixByteLength = strlen($prefix);
        $prefixUtf8Length = Zend_Search_Lucene_Index_Term::getLength($prefix);

        $termRest         = substr($this->_term->text, $prefixByteLength);
        // we calculate length of the rest in bytes since levenshtein() is not UTF-8 compatible
        $termRestLength   = strlen($termRest);

        $scaleFactor = 1/(1 - $this->_minimumSimilarity);

        // Class name spelled with its canonical case so that case-sensitive
        // autoloaders can resolve it.
        $maxTerms = Zend_Search_Lucene::getTermsPerQueryLimit();
        foreach ($fields as $field) {
            $index->resetTermsStream();

            if ($prefix != '') {
                $index->skipTo(new Zend_Search_Lucene_Index_Term($prefix, $field));

                while ($index->currentTerm() !== null          &&
                       $index->currentTerm()->field == $field  &&
                       substr($index->currentTerm()->text, 0, $prefixByteLength) == $prefix) {
                    // Calculate similarity
                    $target       = substr($index->currentTerm()->text, $prefixByteLength);
                    $targetLength = strlen($target);

                    $maxDistance = isset($this->_maxDistances[$targetLength])?
                                       $this->_maxDistances[$targetLength] :
                                       $this->_calculateMaxDistance($prefixUtf8Length, $termRestLength, $targetLength);

                    if ($termRestLength == 0) {
                        // we don't have anything to compare.  That means if we just add
                        // the letters for current term we get the new word
                        $similarity = (($prefixUtf8Length == 0)? 0 : 1 - $targetLength/$prefixUtf8Length);
                    } else if ($targetLength == 0) {
                        $similarity = (($prefixUtf8Length == 0)? 0 : 1 - $termRestLength/$prefixUtf8Length);
                    } else if ($maxDistance < abs($termRestLength - $targetLength)){
                        //just adding the characters of term to target or vice-versa results in too many edits
                        //for example "pre" length is 3 and "prefixes" length is 8.  We can see that
                        //given this optimal circumstance, the edit distance cannot be less than 5.
                        //which is 8-3 or more precisely abs(3-8).
                        //if our maximum edit distance is 4, then we can discard this word
                        //without looking at it.
                        $similarity = 0;
                    } else {
                        $similarity = 1 - levenshtein($termRest, $target)/($prefixUtf8Length + min($termRestLength, $targetLength));
                    }

                    $this->_registerMatch($index->currentTerm(), $similarity, $scaleFactor, $maxTerms);

                    $index->nextTerm();
                }
            } else {
                $index->skipTo(new Zend_Search_Lucene_Index_Term('', $field));

                while ($index->currentTerm() !== null  &&  $index->currentTerm()->field == $field) {
                    // Calculate similarity
                    $target       = $index->currentTerm()->text;
                    $targetLength = strlen($target);

                    $maxDistance = isset($this->_maxDistances[$targetLength])?
                                       $this->_maxDistances[$targetLength] :
                                       $this->_calculateMaxDistance(0, $termRestLength, $targetLength);

                    $shortestLength = min($termRestLength, $targetLength);

                    if ($maxDistance < abs($termRestLength - $targetLength)){
                        //just adding the characters of term to target or vice-versa results in too many edits
                        //for example "pre" length is 3 and "prefixes" length is 8.  We can see that
                        //given this optimal circumstance, the edit distance cannot be less than 5.
                        //which is 8-3 or more precisely abs(3-8).
                        //if our maximum edit distance is 4, then we can discard this word
                        //without looking at it.
                        $similarity = 0;
                    } else if ($shortestLength == 0) {
                        // Getting past the length check with a zero-length string means
                        // both strings are empty, i.e. identical. Avoid the 0/0
                        // division (DivisionByZeroError on PHP 8).
                        $similarity = 1;
                    } else {
                        $similarity = 1 - levenshtein($termRest, $target)/$shortestLength;
                    }

                    $this->_registerMatch($index->currentTerm(), $similarity, $scaleFactor, $maxTerms);

                    $index->nextTerm();
                }
            }

            $index->closeTermsStream();
        }

        if (count($this->_matches) == 0) {
            return new Zend_Search_Lucene_Search_Query_Empty();
        } else if (count($this->_matches) == 1) {
            return new Zend_Search_Lucene_Search_Query_Term(reset($this->_matches));
        } else {
            $rewrittenQuery = new Zend_Search_Lucene_Search_Query_Boolean();

            // Best scores first; equal scores are ordered by term key.
            array_multisort($this->_scores,   SORT_DESC, SORT_NUMERIC,
                            $this->_termKeys, SORT_ASC,  SORT_STRING,
                            $this->_matches);

            $termCount = 0;
            foreach ($this->_matches as $id => $matchedTerm) {
                $subquery = new Zend_Search_Lucene_Search_Query_Term($matchedTerm);
                $subquery->setBoost($this->_scores[$id]);

                $rewrittenQuery->addSubquery($subquery);

                $termCount++;
                if ($termCount >= self::MAX_CLAUSE_COUNT) {
                    break;
                }
            }

            return $rewrittenQuery;
        }
    }

    /**
     * Optimize query in the context of specified index
     *
     * Always throws: a fuzzy query has to be rewritten first.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Exception
     */
    public function optimize(Zend_Search_Lucene_Interface $index)
    {
        throw new Zend_Search_Lucene_Exception('Fuzzy query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Return query terms
     *
     * @return array
     * @throws Zend_Search_Lucene_Exception
     */
    public function getQueryTerms()
    {
        if ($this->_matches === null) {
            throw new Zend_Search_Lucene_Exception('Search has to be performed first to get matched terms');
        }

        return $this->_matches;
    }

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * Always throws: a fuzzy query has to be rewritten first.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @return Zend_Search_Lucene_Search_Weight
     * @throws Zend_Search_Lucene_Exception
     */
    public function createWeight(Zend_Search_Lucene_Interface $reader)
    {
        throw new Zend_Search_Lucene_Exception('Fuzzy query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Execute query in context of index reader
     * It also initializes necessary internal structures
     *
     * Always throws: a fuzzy query has to be rewritten first.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
     * @throws Zend_Search_Lucene_Exception
     */
    public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
    {
        throw new Zend_Search_Lucene_Exception('Fuzzy query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Get document ids likely matching the query
     *
     * It's an array with document ids as keys (performance considerations)
     *
     * Always throws: a fuzzy query has to be rewritten first.
     *
     * @return array
     * @throws Zend_Search_Lucene_Exception
     */
    public function matchedDocs()
    {
        throw new Zend_Search_Lucene_Exception('Fuzzy query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Score specified document
     *
     * Always throws: a fuzzy query has to be rewritten first.
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     * @throws Zend_Search_Lucene_Exception
     */
    public function score($docId, Zend_Search_Lucene_Interface $reader)
    {
        throw new Zend_Search_Lucene_Exception('Fuzzy query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Highlight query terms
     *
     * Highlights the texts of the terms matched by the last rewrite().
     *
     * @param Zend_Search_Lucene_Document_Html $doc
     * @param integer &$colorIndex
     */
    public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
    {
        $words = array();

        // $_matches stays null until rewrite() has run; treat that as "no
        // matched terms" instead of iterating over null.
        if ($this->_matches !== null) {
            foreach ($this->_matches as $term) {
                $words[] = $term->text;
            }
        }

        $doc->highlight($words, $this->_getHighlightColor($colorIndex));
    }

    /**
     * Print a query
     *
     * Format: [field:]text~[similarity]; the similarity is printed only when
     * it differs from the default.
     *
     * @return string
     */
    public function __toString()
    {
        // It's used only for query visualisation, so we don't care about characters escaping
        return (($this->_term->field === null)? '' : $this->_term->field . ':')
             . $this->_term->text . '~'
             . (($this->_minimumSimilarity != self::DEFAULT_MIN_SIMILARITY)? round($this->_minimumSimilarity, 4) : '');
    }
}
Lucene/Search/Query/Insignificant.php 0000666 00000007414 15125712134 0013602 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/** Zend_Search_Lucene_Search_Weight_Empty */
require_once 'Zend/Search/Lucene/Search/Weight/Empty.php';
/**
* The insignificant query returns empty result, but doesn't limit result set as a part of other queries
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_Insignificant extends Zend_Search_Lucene_Search_Query
{
    /**
     * Rewriting an insignificant query is a no-op.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query  this very object
     */
    public function rewrite(Zend_Search_Lucene_Interface $index)
    {
        return $this;
    }

    /**
     * Optimizing an insignificant query is a no-op.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query  this very object
     */
    public function optimize(Zend_Search_Lucene_Interface $index)
    {
        return $this;
    }

    /**
     * Provide the Weight implementation for this query: a fresh empty weight.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @return Zend_Search_Lucene_Search_Weight
     */
    public function createWeight(Zend_Search_Lucene_Interface $reader)
    {
        $emptyWeight = new Zend_Search_Lucene_Search_Weight_Empty();

        return $emptyWeight;
    }

    /**
     * Execute query in context of index reader.
     *
     * There is nothing to collect or initialize for this query type.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
     */
    public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
    {
        // Intentionally empty
    }

    /**
     * Document ids matching the query (ids would be the array keys).
     *
     * @return array  always an empty array
     */
    public function matchedDocs()
    {
        return array();
    }

    /**
     * Score specified document.
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float  always 0
     */
    public function score($docId, Zend_Search_Lucene_Interface $reader)
    {
        return 0;
    }

    /**
     * Terms of the query.
     *
     * @return array  always an empty array
     */
    public function getQueryTerms()
    {
        return array();
    }

    /**
     * Highlight query terms.
     *
     * Nothing is highlighted; neither argument is touched.
     *
     * @param Zend_Search_Lucene_Document_Html $doc
     * @param integer &$colorIndex
     */
    public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
    {
        // Intentionally empty
    }

    /**
     * Printable form of the query.
     *
     * @return string
     */
    public function __toString()
    {
        return '<InsignificantQuery>';
    }
}
Lucene/Search/Query/Wildcard.php 0000666 00000025623 15125712134 0012550 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/** Zend_Search_Lucene_Search_Query_MultiTerm */
require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_Wildcard extends Zend_Search_Lucene_Search_Query
{
    /**
     * Search pattern.
     *
     * Field has to be fully specified or has to be null
     * Text may contain '*' or '?' symbols
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_pattern;

    /**
     * Matched terms.
     *
     * It's filled during the search (rewrite operation) and may be used for search result
     * post-processing. It stays null until rewrite() has been called.
     *
     * Array of Zend_Search_Lucene_Index_Term objects
     *
     * @var array|null
     */
    private $_matches = null;

    /**
     * Minimum term prefix length (minimum number of leading non-wildcard characters)
     *
     * @var integer
     */
    private static $_minPrefixLength = 3;

    /**
     * Zend_Search_Lucene_Search_Query_Wildcard constructor.
     *
     * @param Zend_Search_Lucene_Index_Term $pattern
     */
    public function __construct(Zend_Search_Lucene_Index_Term $pattern)
    {
        $this->_pattern = $pattern;
    }

    /**
     * Get minimum prefix length
     *
     * @return integer
     */
    public static function getMinPrefixLength()
    {
        return self::$_minPrefixLength;
    }

    /**
     * Set minimum prefix length
     *
     * @param integer $minPrefixLength
     */
    public static function setMinPrefixLength($minPrefixLength)
    {
        self::$_minPrefixLength = $minPrefixLength;
    }

    /**
     * Get terms prefix: the part of $word preceding the first '?' or '*'.
     *
     * The whole word is returned if it contains no wildcard symbols.
     *
     * @param string $word
     * @return string
     */
    private static function _getPrefix($word)
    {
        $questionMarkPosition = strpos($word, '?');
        $asteriskPosition     = strpos($word, '*');

        if ($questionMarkPosition !== false) {
            if ($asteriskPosition !== false) {
                return substr($word, 0, min($questionMarkPosition, $asteriskPosition));
            }

            return substr($word, 0, $questionMarkPosition);
        } else if ($asteriskPosition !== false) {
            return substr($word, 0, $asteriskPosition);
        }

        return $word;
    }

    /**
     * Build the PCRE expression equivalent to the wildcard pattern text.
     *
     * '?' is translated into '.', '*' into '.*'; everything else is matched literally.
     * The 'u' modifier is appended when PCRE is compiled with Unicode support.
     *
     * @return string
     */
    private function _getMatchExpression()
    {
        // preg_quote() escapes '?' and '*', so the escaped forms are what has to be replaced
        $matchExpression = '/^'
                         . str_replace(array('\\?', '\\*'), array('.', '.*'), preg_quote($this->_pattern->text, '/'))
                         . '$/';

        /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
        if (@preg_match('/\pL/u', 'a') == 1) {
            // PCRE unicode support is turned on
            // add Unicode modifier to the match expression
            $matchExpression .= 'u';
        }

        return $matchExpression;
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * Every term of the searched field(s) which matches the pattern is collected
     * (see getQueryTerms()) and the result is an Empty, Term or MultiTerm query
     * depending on the number of matched terms.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Exception  if the pattern prefix is shorter than the configured
     *                                       minimum or the terms per query limit is exceeded
     */
    public function rewrite(Zend_Search_Lucene_Interface $index)
    {
        $this->_matches = array();

        if ($this->_pattern->field === null) {
            // Search through all fields
            $fields = $index->getFieldNames(true /* indexed fields list */);
        } else {
            $fields = array($this->_pattern->field);
        }

        $prefix       = self::_getPrefix($this->_pattern->text);
        $prefixLength = strlen($prefix);

        if ($prefixLength < self::$_minPrefixLength) {
            throw new Zend_Search_Lucene_Exception('At least ' . self::$_minPrefixLength . ' non-wildcard characters are required at the beginning of pattern.');
        }

        $matchExpression = $this->_getMatchExpression();
        $maxTerms        = Zend_Search_Lucene::getTermsPerQueryLimit();

        foreach ($fields as $field) {
            $index->resetTermsStream();
            // With an empty prefix this positions the stream at the first term of the field
            $index->skipTo(new Zend_Search_Lucene_Index_Term($prefix, $field));

            while (($term = $index->currentTerm()) !== null && $term->field == $field) {
                // Byte-wise prefix test. A loose '==' comparison would treat numeric-looking
                // strings as numbers ('1e2' == '100') and keep scanning past the prefix range.
                if ($prefixLength > 0 && strncmp($term->text, $prefix, $prefixLength) !== 0) {
                    break;
                }

                if (preg_match($matchExpression, $term->text) === 1) {
                    $this->_matches[] = $term;

                    if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                        // Do not leave the terms stream open when the scan is aborted
                        $index->closeTermsStream();
                        throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
                    }
                }

                $index->nextTerm();
            }

            $index->closeTermsStream();
        }

        $matchesCount = count($this->_matches);

        if ($matchesCount == 0) {
            return new Zend_Search_Lucene_Search_Query_Empty();
        }

        if ($matchesCount == 1) {
            return new Zend_Search_Lucene_Search_Query_Term(reset($this->_matches));
        }

        $rewrittenQuery = new Zend_Search_Lucene_Search_Query_MultiTerm();
        foreach ($this->_matches as $matchedTerm) {
            $rewrittenQuery->addTerm($matchedTerm);
        }

        return $rewrittenQuery;
    }

    /**
     * Optimize query in the context of specified index
     *
     * Not supported: a wildcard query has to be rewritten first.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Exception
     */
    public function optimize(Zend_Search_Lucene_Interface $index)
    {
        throw new Zend_Search_Lucene_Exception('Wildcard query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Returns query pattern
     *
     * @return Zend_Search_Lucene_Index_Term
     */
    public function getPattern()
    {
        return $this->_pattern;
    }

    /**
     * Return query terms (terms matched by the last rewrite() call)
     *
     * @return array
     * @throws Zend_Search_Lucene_Exception  if rewrite() has not been performed yet
     */
    public function getQueryTerms()
    {
        if ($this->_matches === null) {
            throw new Zend_Search_Lucene_Exception('Search has to be performed first to get matched terms');
        }

        return $this->_matches;
    }

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * Not supported: a wildcard query has to be rewritten first.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @return Zend_Search_Lucene_Search_Weight
     * @throws Zend_Search_Lucene_Exception
     */
    public function createWeight(Zend_Search_Lucene_Interface $reader)
    {
        throw new Zend_Search_Lucene_Exception('Wildcard query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Execute query in context of index reader
     *
     * Not supported: a wildcard query has to be rewritten first.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
     * @throws Zend_Search_Lucene_Exception
     */
    public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
    {
        throw new Zend_Search_Lucene_Exception('Wildcard query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Get document ids likely matching the query
     *
     * Not supported: a wildcard query has to be rewritten first.
     *
     * @return array
     * @throws Zend_Search_Lucene_Exception
     */
    public function matchedDocs()
    {
        throw new Zend_Search_Lucene_Exception('Wildcard query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Score specified document
     *
     * Not supported: a wildcard query has to be rewritten first.
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     * @throws Zend_Search_Lucene_Exception
     */
    public function score($docId, Zend_Search_Lucene_Interface $reader)
    {
        throw new Zend_Search_Lucene_Exception('Wildcard query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Highlight query terms
     *
     * The 'body' field of the document is tokenized with the default analyzer and
     * every token matching the wildcard pattern is highlighted.
     *
     * @param Zend_Search_Lucene_Document_Html $doc
     * @param integer &$colorIndex
     */
    public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
    {
        $words           = array();
        $matchExpression = $this->_getMatchExpression();

        $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($doc->getFieldUtf8Value('body'), 'UTF-8');
        foreach ($tokens as $token) {
            if (preg_match($matchExpression, $token->getTermText()) === 1) {
                $words[] = $token->getTermText();
            }
        }

        $doc->highlight($words, $this->_getHighlightColor($colorIndex));
    }

    /**
     * Print a query
     *
     * @return string
     */
    public function __toString()
    {
        // It's used only for query visualisation, so we don't care about characters escaping
        return (($this->_pattern->field === null)? '' : $this->_pattern->field . ':') . $this->_pattern->text;
    }
}
Lucene/Search/Query/Boolean.php 0000666 00000065650 15125712134 0012402 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/** Zend_Search_Lucene_Search_Weight_Boolean */
require_once 'Zend/Search/Lucene/Search/Weight/Boolean.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_Boolean extends Zend_Search_Lucene_Search_Query
{
/**
* Subqueries
* Array of Zend_Search_Lucene_Search_Query
*
* @var array
*/
private $_subqueries = array();
/**
* Subqueries signs.
* If true then subquery is required.
* If false then subquery is prohibited.
* If null then subquery is neither prohibited, nor required
*
* If array is null then all subqueries are required
*
* @var array
*/
private $_signs = array();
/**
* Result vector.
*
* @var array
*/
private $_resVector = null;
/**
* A score factor based on the fraction of all query subqueries
* that a document contains.
* float for conjunction queries
* array of float for non conjunction queries
*
* @var mixed
*/
private $_coord = null;
/**
 * Class constructor. Create a new Boolean query object.
 *
 * If the $signs array is omitted (or contains only TRUE values) then all
 * subqueries are treated as required and the signs list is stored as null.
 * It differs from addSubquery() behavior, but should never be used
 *
 * @param array $subqueries Array of Zend_Search_Lucene_Search_Query objects
 * @param array $signs      Array of signs. Sign is boolean|null.
 * @return void
 */
public function __construct($subqueries = null, $signs = null)
{
    if (!is_array($subqueries)) {
        // Nothing to initialize, default property values are kept
        return;
    }

    $this->_subqueries = $subqueries;
    // null means "all subqueries are required"
    $this->_signs      = null;

    if (!is_array($signs)) {
        return;
    }

    // Keep the signs list only if at least one subquery is not required
    foreach ($signs as $candidateSign) {
        if ($candidateSign !== true) {
            $this->_signs = $signs;
            return;
        }
    }
}
/**
 * Add a $subquery (Zend_Search_Lucene_Search_Query) to this query.
 *
 * The sign is specified as:
 *     TRUE  - subquery is required
 *     FALSE - subquery is prohibited
 *     NULL  - subquery is neither prohibited, nor required
 *
 * @param  Zend_Search_Lucene_Search_Query $subquery
 * @param  boolean|null $sign
 * @return void
 */
public function addSubquery(Zend_Search_Lucene_Search_Query $subquery, $sign = null)
{
    $allRequiredSoFar = ($this->_signs === null);

    // The signs list stays null while every subquery (including the new one) is required
    if (!($sign === true && $allRequiredSoFar)) {
        if ($allRequiredSoFar) {
            // Materialize the implicit "required" sign of every previously added subquery
            $this->_signs = array();
            for ($remaining = count($this->_subqueries); $remaining > 0; $remaining--) {
                $this->_signs[] = true;
            }
        }

        $this->_signs[] = $sign;
    }

    $this->_subqueries[] = $subquery;
}
/**
 * Re-write queries into primitive queries
 *
 * Builds a new Boolean query (with the same boost) from the rewritten subqueries,
 * each of them keeping its original sign.
 *
 * @param Zend_Search_Lucene_Interface $index
 * @return Zend_Search_Lucene_Search_Query
 */
public function rewrite(Zend_Search_Lucene_Interface $index)
{
    $rewritten = new Zend_Search_Lucene_Search_Query_Boolean();
    $rewritten->setBoost($this->getBoost());

    foreach ($this->_subqueries as $clauseId => $clause) {
        $rewrittenClause = $clause->rewrite($index);

        // A null signs list means that every subquery is required
        if ($this->_signs === null) {
            $clauseSign = true;
        } else {
            $clauseSign = $this->_signs[$clauseId];
        }

        $rewritten->addSubquery($rewrittenClause, $clauseSign);
    }

    return $rewritten;
}
/**
 * Optimize query in the context of specified index
 *
 * The steps performed, in order:
 *  1. every subquery is optimized;
 *  2. insignificant subqueries are dropped;
 *  3. empty subqueries are dropped (or make the whole query empty if they are required);
 *  4. a query left with a single subquery is replaced by that subquery;
 *  5. term and multi-term subqueries are decomposed and regrouped into
 *     Term/MultiTerm clauses where that is possible.
 *
 * $subqueries and $signs are kept aligned by key during the whole process:
 * entries are always removed from (and appended to) both arrays together.
 *
 * @param Zend_Search_Lucene_Interface $index
 * @return Zend_Search_Lucene_Search_Query
 */
public function optimize(Zend_Search_Lucene_Interface $index)
{
$subqueries = array();
$signs = array();
// Optimize all subqueries
// (a null signs list means that every subquery is required)
foreach ($this->_subqueries as $id => $subquery) {
$subqueries[] = $subquery->optimize($index);
$signs[] = ($this->_signs === null)? true : $this->_signs[$id];
}
// Remove insignificant subqueries
foreach ($subqueries as $id => $subquery) {
if ($subquery instanceof Zend_Search_Lucene_Search_Query_Insignificant) {
// Insignificant subquery has to be removed anyway
unset($subqueries[$id]);
unset($signs[$id]);
}
}
if (count($subqueries) == 0) {
// Boolean query has no significant subqueries
return new Zend_Search_Lucene_Search_Query_Insignificant();
}
// Check if all significant subqueries are prohibited
$allProhibited = true;
foreach ($signs as $sign) {
if ($sign !== false) {
$allProhibited = false;
break;
}
}
if ($allProhibited) {
return new Zend_Search_Lucene_Search_Query_Insignificant();
}
// Check for empty subqueries
foreach ($subqueries as $id => $subquery) {
if ($subquery instanceof Zend_Search_Lucene_Search_Query_Empty) {
if ($signs[$id] === true) {
// Matching is required, but is actually empty
return new Zend_Search_Lucene_Search_Query_Empty();
} else {
// Matching is optional or prohibited, but is empty
// Remove it from subqueries and signs list
unset($subqueries[$id]);
unset($signs[$id]);
}
}
}
// Check, if reduced subqueries list is empty
if (count($subqueries) == 0) {
return new Zend_Search_Lucene_Search_Query_Empty();
}
// Check if all non-empty subqueries are prohibited
$allProhibited = true;
foreach ($signs as $sign) {
if ($sign !== false) {
$allProhibited = false;
break;
}
}
if ($allProhibited) {
return new Zend_Search_Lucene_Search_Query_Empty();
}
// Check, if reduced subqueries list has only one entry
if (count($subqueries) == 1) {
// It's a query with only one required or optional clause
// (it's already checked, that it's not a prohibited clause)
if ($this->getBoost() == 1) {
return reset($subqueries);
}
// Clone the subquery, so the boost of the original object is left untouched
$optimizedQuery = clone reset($subqueries);
$optimizedQuery->setBoost($optimizedQuery->getBoost()*$this->getBoost());
return $optimizedQuery;
}
// Prepare first candidate for optimized query
// (it's returned as is if nothing can be decomposed or regrouped below)
$optimizedQuery = new Zend_Search_Lucene_Search_Query_Boolean($subqueries, $signs);
$optimizedQuery->setBoost($this->getBoost());
// Decomposed terms, their signs and boost factors (three arrays aligned by key)
$terms = array();
$tsigns = array();
$boostFactors = array();
// Try to decompose term and multi-term subqueries
foreach ($subqueries as $id => $subquery) {
if ($subquery instanceof Zend_Search_Lucene_Search_Query_Term) {
$terms[] = $subquery->getTerm();
$tsigns[] = $signs[$id];
$boostFactors[] = $subquery->getBoost();
// remove subquery from a subqueries list
unset($subqueries[$id]);
unset($signs[$id]);
} else if ($subquery instanceof Zend_Search_Lucene_Search_Query_MultiTerm) {
$subTerms = $subquery->getTerms();
$subSigns = $subquery->getSigns();
if ($signs[$id] === true) {
// It's a required multi-term subquery.
// Something like '... +(+term1 -term2 term3 ...) ...'
// Multi-term required subquery can be decomposed only if it contains
// required terms and doesn't contain prohibited terms:
// ... +(+term1 term2 ...) ... => ... +term1 term2 ...
//
// Check this
$hasRequired = false;
$hasProhibited = false;
if ($subSigns === null) {
// All subterms are required
$hasRequired = true;
} else {
foreach ($subSigns as $sign) {
if ($sign === true) {
$hasRequired = true;
} else if ($sign === false) {
$hasProhibited = true;
break;
}
}
}
// Continue if subquery has prohibited terms or doesn't have required terms
// (the subquery is left in the subqueries list in this case)
if ($hasProhibited || !$hasRequired) {
continue;
}
foreach ($subTerms as $termId => $term) {
$terms[] = $term;
$tsigns[] = ($subSigns === null)? true : $subSigns[$termId];
$boostFactors[] = $subquery->getBoost();
}
// remove subquery from a subqueries list
unset($subqueries[$id]);
unset($signs[$id]);
} else { // $signs[$id] === null || $signs[$id] === false
// It's an optional or prohibited multi-term subquery.
// Something like '... (+term1 -term2 term3 ...) ...'
// or
// something like '... -(+term1 -term2 term3 ...) ...'
// Multi-term optional and prohibited subqueries can be decomposed
// only if all terms are optional.
//
// Check if all terms are optional.
$onlyOptional = true;
if ($subSigns === null) {
// All subterms are required
$onlyOptional = false;
} else {
foreach ($subSigns as $sign) {
if ($sign !== null) {
$onlyOptional = false;
break;
}
}
}
// Continue if non-optional terms are presented in this multi-term subquery
// (the subquery is left in the subqueries list in this case)
if (!$onlyOptional) {
continue;
}
// Every decomposed term inherits the sign of the subquery itself
foreach ($subTerms as $termId => $term) {
$terms[] = $term;
$tsigns[] = ($signs[$id] === null)? null /* optional */ :
false /* prohibited */;
$boostFactors[] = $subquery->getBoost();
}
// remove subquery from a subqueries list
unset($subqueries[$id]);
unset($signs[$id]);
}
}
}
// Check, if there are no decomposed subqueries
if (count($terms) == 0 ) {
// return prepared candidate
return $optimizedQuery;
}
// Check, if all subqueries have been decomposed and all terms have the same boost factor
if (count($subqueries) == 0 && count(array_unique($boostFactors)) == 1) {
$optimizedQuery = new Zend_Search_Lucene_Search_Query_MultiTerm($terms, $tsigns);
$optimizedQuery->setBoost(reset($boostFactors)*$this->getBoost());
return $optimizedQuery;
}
// This boolean query can't be transformed to Term/MultiTerm query and still contains
// several subqueries
// Separate prohibited terms
$prohibitedTerms = array();
foreach ($terms as $id => $term) {
if ($tsigns[$id] === false) {
$prohibitedTerms[] = $term;
unset($terms[$id]);
unset($tsigns[$id]);
unset($boostFactors[$id]);
}
}
// Regroup the remaining (required and optional) terms into a single clause
if (count($terms) == 1) {
$clause = new Zend_Search_Lucene_Search_Query_Term(reset($terms));
$clause->setBoost(reset($boostFactors));
$subqueries[] = $clause;
$signs[] = reset($tsigns);
// Clear terms list
$terms = array();
} else if (count($terms) > 1 && count(array_unique($boostFactors)) == 1) {
$clause = new Zend_Search_Lucene_Search_Query_MultiTerm($terms, $tsigns);
$clause->setBoost(reset($boostFactors));
$subqueries[] = $clause;
// Clause sign is 'required' if clause contains required terms. 'Optional' otherwise.
$signs[] = (in_array(true, $tsigns))? true : null;
// Clear terms list
$terms = array();
}
// Regroup prohibited terms into a single prohibited clause
if (count($prohibitedTerms) == 1) {
// (boost factors are not significant for prohibited clauses)
$subqueries[] = new Zend_Search_Lucene_Search_Query_Term(reset($prohibitedTerms));
$signs[] = false;
// Clear prohibited terms list
$prohibitedTerms = array();
} else if (count($prohibitedTerms) > 1) {
// prepare signs array
$prohibitedSigns = array();
foreach ($prohibitedTerms as $id => $term) {
// all prohibited terms are grouped as optional into multi-term query
$prohibitedSigns[$id] = null;
}
// (boost factors are not significant for prohibited clauses)
$subqueries[] = new Zend_Search_Lucene_Search_Query_MultiTerm($prohibitedTerms, $prohibitedSigns);
// Clause sign is 'prohibited'
$signs[] = false;
// Clear terms list
$prohibitedTerms = array();
}
/** @todo Group terms with the same boost factors together */
// Check, that all terms are processed
// Replace candidate for optimized query
// ($terms is left non-empty above when several terms have different boost factors;
// the first candidate is returned unchanged in this case)
if (count($terms) == 0 && count($prohibitedTerms) == 0) {
$optimizedQuery = new Zend_Search_Lucene_Search_Query_Boolean($subqueries, $signs);
$optimizedQuery->setBoost($this->getBoost());
}
return $optimizedQuery;
}
/**
 * Returns subqueries
 *
 * The returned list is the internal subqueries array; its keys correspond
 * to the keys of the signs list returned by getSigns() (when that is not null).
 *
 * @return array Array of Zend_Search_Lucene_Search_Query objects
 */
public function getSubqueries()
{
return $this->_subqueries;
}
/**
 * Return subqueries signs
 *
 * Each sign is TRUE (required), FALSE (prohibited) or NULL (optional).
 * The result is null (instead of an array) if all subqueries are required.
 *
 * @return array|null
 */
public function getSigns()
{
return $this->_signs;
}
/**
 * Constructs an appropriate Weight implementation for this query.
 *
 * The created weight object is also stored in the query itself.
 *
 * @param Zend_Search_Lucene_Interface $reader
 * @return Zend_Search_Lucene_Search_Weight
 */
public function createWeight(Zend_Search_Lucene_Interface $reader)
{
    $booleanWeight = new Zend_Search_Lucene_Search_Weight_Boolean($this, $reader);
    $this->_weight = $booleanWeight;

    return $booleanWeight;
}
/**
 * Calculate result vector for Conjunction query
 * (like '<subquery1> AND <subquery2> AND <subquery3>')
 *
 * The result is the intersection (by document id key) of the subqueries
 * result vectors; entries are taken from the smallest vector.
 */
private function _calculateConjunctionResult()
{
    // A query without subqueries matches nothing
    $this->_resVector = (count($this->_subqueries) == 0) ? array() : null;

    $vectors     = array();
    $vectorSizes = array();
    $vectorIds   = array(); // is used to prevent arrays comparison

    foreach ($this->_subqueries as $clauseId => $clause) {
        $clauseDocs    = $clause->matchedDocs();
        $vectors[]     = $clauseDocs;
        $vectorSizes[] = count($clauseDocs);
        $vectorIds[]   = $clauseId;
    }

    // Process result vectors in order of increasing size
    array_multisort($vectorSizes, SORT_ASC, SORT_NUMERIC,
                    $vectorIds,   SORT_ASC, SORT_NUMERIC,
                    $vectors);

    foreach ($vectors as $candidate) {
        if ($this->_resVector === null) {
            $this->_resVector = $candidate;
        } else {
            // Manual intersection by key. It's used as a workaround
            // for array_intersect_key() slowness problem.
            $intersection = array();
            foreach ($this->_resVector as $docId => $docValue) {
                if (isset($candidate[$docId])) {
                    $intersection[$docId] = $docValue;
                }
            }
            $this->_resVector = $intersection;
        }

        if (count($this->_resVector) == 0) {
            // Empty result set, we don't need to check other terms
            break;
        }
    }

    // No ksort() is necessary here: used algorithm doesn't change elements order
}
/**
 * Calculate result vector for non Conjunction query
 * (like '<subquery1> AND <subquery2> AND NOT <subquery3> OR <subquery4>')
 *
 * If there are required subqueries, the result is the intersection of their
 * result vectors. Otherwise it's the union of optional subqueries result vectors.
 * Prohibited subqueries are not taken into account here.
 */
private function _calculateNonConjunctionResult()
{
    $mandatoryVectors = array();
    $mandatorySizes   = array();
    $mandatoryIds     = array(); // is used to prevent arrays comparison
    $optional         = array();

    foreach ($this->_subqueries as $clauseId => $clause) {
        $clauseSign = $this->_signs[$clauseId];

        if ($clauseSign === false) {
            // prohibited
            // Do nothing. matchedDocs() may include non-matching id's
            // Calculating prohibited vector may take significant time, but do not affect the result
            // Skipped.
            continue;
        }

        if ($clauseSign === true) {
            // required
            $clauseDocs         = $clause->matchedDocs();
            $mandatoryVectors[] = $clauseDocs;
            $mandatorySizes[]   = count($clauseDocs);
            $mandatoryIds[]     = $clauseId;
        } else {
            // neither required, nor prohibited
            // array union
            $optional += $clause->matchedDocs();
        }
    }

    // Process required result vectors in order of increasing size
    array_multisort($mandatorySizes,   SORT_ASC, SORT_NUMERIC,
                    $mandatoryIds,     SORT_ASC, SORT_NUMERIC,
                    $mandatoryVectors);

    $required = null;
    foreach ($mandatoryVectors as $candidate) {
        if ($required === null) {
            $required = $candidate;
        } else {
            // Manual intersection by key. It's used as a workaround
            // for array_intersect_key() slowness problem.
            $intersection = array();
            foreach ($required as $docId => $docValue) {
                if (isset($candidate[$docId])) {
                    $intersection[$docId] = $docValue;
                }
            }
            $required = $intersection;
        }

        if (count($required) == 0) {
            // Empty result set, we don't need to check other terms
            break;
        }
    }

    // Required subqueries (if any) completely define the result set
    $this->_resVector = ($required !== null) ? $required : $optional;

    ksort($this->_resVector, SORT_NUMERIC);
}
/**
 * Score calculator for conjunction queries (all subqueries are required)
 *
 * Returns 0 as soon as one of the subqueries doesn't match the document.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene_Interface $reader
 * @return float
 */
public function _conjunctionScore($docId, Zend_Search_Lucene_Interface $reader)
{
    if ($this->_coord === null) {
        // All subqueries are matched by definition, so the coord factor is the same for all documents
        $subqueriesCount = count($this->_subqueries);
        $this->_coord    = $reader->getSimilarity()->coord($subqueriesCount, $subqueriesCount);
    }

    $score = 0;

    foreach ($this->_subqueries as $subquery) {
        // Each subquery is scored exactly once per document. Calling score() a second time
        // to accumulate the sum doubles the work at every level of nested boolean queries.
        $subscore = $subquery->score($docId, $reader);

        if ($subscore == 0) {
            return 0;
        }

        $score += $subscore * $this->_coord;
    }

    // NOTE(review): the coord factor is applied to every subscore above and once more here,
    // while _nonConjunctionScore() applies it only once. It's kept as is to preserve
    // existing scores - confirm whether the double application is intended.
    return $score * $this->_coord * $this->getBoost();
}
/**
 * Score calculator for non conjunction queries (not all subqueries are required)
 *
 * Returns 0 if a prohibited subquery matches the document or a required one doesn't.
 * Otherwise it's the sum of matched subqueries scores multiplied by the coord factor
 * (chosen by the number of matched subqueries) and by the query boost.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene_Interface $reader
 * @return float
 */
public function _nonConjunctionScore($docId, Zend_Search_Lucene_Interface $reader)
{
    if ($this->_coord === null) {
        // Lazily prepare coord factors for every possible number of matched subqueries
        $this->_coord = array();

        $maxCoord = 0;
        foreach ($this->_signs as $sign) {
            if ($sign !== false /* not prohibited */) {
                $maxCoord++;
            }
        }

        for ($overlap = 0; $overlap <= $maxCoord; $overlap++) {
            $this->_coord[$overlap] = $reader->getSimilarity()->coord($overlap, $maxCoord);
        }
    }

    $total        = 0;
    $matchedCount = 0;

    foreach ($this->_subqueries as $clauseId => $clause) {
        $clauseScore = $clause->score($docId, $reader);
        $clauseSign  = $this->_signs[$clauseId];
        $isMatched   = ($clauseScore != 0);

        // Prohibited, but matches; or is required, but doesn't match
        if (($clauseSign === false && $isMatched) || ($clauseSign === true && !$isMatched)) {
            return 0;
        }

        if ($isMatched) {
            $matchedCount++;
            $total += $clauseScore;
        }
    }

    return $total * $this->_coord[$matchedCount] * $this->getBoost();
}
/**
 * Execute query in context of index reader
 * It also initializes necessary internal structures
 *
 * Required subqueries share one documents filter (the one provided by the upper
 * query or a local one); other subqueries are executed without a filter.
 *
 * @param Zend_Search_Lucene_Interface $reader
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
 */
public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
{
    // Initialize weight if it's not done yet
    $this->_initWeight($reader);

    // Create local documents filter if it's not provided by upper query
    if ($docsFilter === null) {
        $docsFilter = new Zend_Search_Lucene_Index_DocsFilter();
    }

    foreach ($this->_subqueries as $clauseId => $clause) {
        $isRequired = ($this->_signs == null) || ($this->_signs[$clauseId] === true);

        if ($isRequired) {
            $clause->execute($reader, $docsFilter);
        } else {
            $clause->execute($reader);
        }
    }

    // A null signs list means that all subqueries are required
    if ($this->_signs !== null) {
        $this->_calculateNonConjunctionResult();
        return;
    }

    $this->_calculateConjunctionResult();
}
/**
 * Get document ids likely matching the query
 *
 * It's an array with document ids as keys (performance considerations)
 *
 * The result vector is calculated by execute(); null is returned
 * if the query has not been executed yet.
 *
 * @return array|null
 */
public function matchedDocs()
{
return $this->_resVector;
}
/**
 * Score specified document
 *
 * Documents which are not in the result vector get zero score.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene_Interface $reader
 * @return float
 */
public function score($docId, Zend_Search_Lucene_Interface $reader)
{
    if (!isset($this->_resVector[$docId])) {
        return 0;
    }

    // A null signs list means that all subqueries are required (conjunction query)
    return ($this->_signs === null)
         ? $this->_conjunctionScore($docId, $reader)
         : $this->_nonConjunctionScore($docId, $reader);
}
/**
 * Return query terms
 *
 * Terms of prohibited subqueries are not included.
 *
 * @return array
 */
public function getQueryTerms()
{
    $collected = array();

    foreach ($this->_subqueries as $clauseId => $clause) {
        if ($this->_signs !== null && $this->_signs[$clauseId] === false) {
            // Prohibited subquery
            continue;
        }

        $collected = array_merge($collected, $clause->getQueryTerms());
    }

    return $collected;
}
/**
 * Highlight query terms
 *
 * Highlighting is delegated to every subquery which is not prohibited.
 *
 * @param Zend_Search_Lucene_Document_Html $doc
 * @param integer &$colorIndex
 */
public function highlightMatchesDOM(Zend_Search_Lucene_Document_Html $doc, &$colorIndex)
{
    foreach ($this->_subqueries as $clauseId => $clause) {
        if ($this->_signs !== null && $this->_signs[$clauseId] === false) {
            // Prohibited subquery, nothing to highlight
            continue;
        }

        $clause->highlightMatchesDOM($doc, $colorIndex);
    }
}
/**
 * Print a query
 *
 * Subqueries are printed in parentheses, separated by a single space and
 * prefixed with '+' (required) or '-' (prohibited). A boost factor different
 * from 1 is appended as '^<boost>'.
 *
 * @return string
 */
public function __toString()
{
    // It's used only for query visualisation, so we don't care about characters escaping
    $query   = '';
    $isFirst = true;

    foreach ($this->_subqueries as $id => $subquery) {
        // The separator must not depend on array keys: the subqueries list may have
        // sparse keys (e.g. when it's built by optimize()), so key 0 may be absent
        if ($isFirst) {
            $isFirst = false;
        } else {
            $query .= ' ';
        }

        if ($this->_signs === null || $this->_signs[$id] === true) {
            $query .= '+';
        } else if ($this->_signs[$id] === false) {
            $query .= '-';
        }

        $query .= '(' . $subquery->__toString() . ')';

        if ($subquery->getBoost() != 1) {
            $query .= '^' . round($subquery->getBoost(), 4);
        }
    }

    return $query;
}
}
Lucene/Search/QueryEntry/Subquery.php 0000666 00000004431 15125712134 0013652 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Index_Term */
require_once 'Zend/Search/Lucene/Index/Term.php';
/** Zend_Search_Lucene_Search_QueryEntry */
require_once 'Zend/Search/Lucene/Search/QueryEntry.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_QueryEntry_Subquery extends Zend_Search_Lucene_Search_QueryEntry
{
    /**
     * Wrapped query
     *
     * @var Zend_Search_Lucene_Search_Query
     */
    private $_query;

    /**
     * Object constructor
     *
     * @param Zend_Search_Lucene_Search_Query $query
     */
    public function __construct(Zend_Search_Lucene_Search_Query $query)
    {
        $this->_query = $query;
    }

    /**
     * Process modifier ('~')
     *
     * The modifier is not applicable to a subquery, so an exception is always thrown.
     *
     * @param mixed $parameter
     * @throws Zend_Search_Lucene_Search_QueryParserException
     */
    public function processFuzzyProximityModifier($parameter = null)
    {
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';

        $message = '\'~\' sign must follow term or phrase';
        throw new Zend_Search_Lucene_Search_QueryParserException($message);
    }

    /**
     * Transform entry to a subquery
     *
     * The entry boost is assigned to the wrapped query, which is then returned.
     *
     * @param string $encoding
     * @return Zend_Search_Lucene_Search_Query
     */
    public function getQuery($encoding)
    {
        $wrappedQuery = $this->_query;
        $wrappedQuery->setBoost($this->_boost);

        return $wrappedQuery;
    }
}
Lucene/Search/QueryEntry/Term.php 0000666 00000014274 15125712134 0012750 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Index_Term */
require_once 'Zend/Search/Lucene/Index/Term.php';
/** Zend_Search_Lucene_Search_QueryEntry */
require_once 'Zend/Search/Lucene/Search/QueryEntry.php';
/** Zend_Search_Lucene_Analysis_Analyzer */
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_QueryEntry_Term extends Zend_Search_Lucene_Search_QueryEntry
{
/**
* Term value
*
* @var string
*/
private $_term;
/**
* Field
*
* @var string|null
*/
private $_field;
/**
* Fuzzy search query
*
* @var boolean
*/
private $_fuzzyQuery = false;
/**
* Similarity
*
* @var float
*/
private $_similarity = 1.;
/**
* Object constractor
*
* @param string $term
* @param string $field
*/
public function __construct($term, $field)
{
$this->_term = $term;
$this->_field = $field;
}
/**
* Process modifier ('~')
*
* @param mixed $parameter
*/
public function processFuzzyProximityModifier($parameter = null)
{
$this->_fuzzyQuery = true;
if ($parameter !== null) {
$this->_similarity = $parameter;
} else {
$this->_similarity = Zend_Search_Lucene_Search_Query_Fuzzy::DEFAULT_MIN_SIMILARITY;
}
}
/**
* Transform entry to a subquery
*
* @param string $encoding
* @return Zend_Search_Lucene_Search_Query
* @throws Zend_Search_Lucene_Search_QueryParserException
*/
public function getQuery($encoding)
{
if (strpos($this->_term, '?') !== false || strpos($this->_term, '*') !== false) {
if ($this->_fuzzyQuery) {
require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search is not supported for terms with wildcards.');
}
$pattern = '';
$subPatterns = explode('*', $this->_term);
$astericFirstPass = true;
foreach ($subPatterns as $subPattern) {
if (!$astericFirstPass) {
$pattern .= '*';
} else {
$astericFirstPass = false;
}
$subPatternsL2 = explode('?', $subPattern);
$qMarkFirstPass = true;
foreach ($subPatternsL2 as $subPatternL2) {
if (!$qMarkFirstPass) {
$pattern .= '?';
} else {
$qMarkFirstPass = false;
}
$tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($subPatternL2, $encoding);
if (count($tokens) > 1) {
require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
throw new Zend_Search_Lucene_Search_QueryParserException('Wildcard search is supported only for non-multiple word terms');
}
foreach ($tokens as $token) {
$pattern .= $token->getTermText();
}
}
}
$term = new Zend_Search_Lucene_Index_Term($pattern, $this->_field);
$query = new Zend_Search_Lucene_Search_Query_Wildcard($term);
$query->setBoost($this->_boost);
return $query;
}
$tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_term, $encoding);
if (count($tokens) == 0) {
return new Zend_Search_Lucene_Search_Query_Insignificant();
}
if (count($tokens) == 1 && !$this->_fuzzyQuery) {
$term = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);
$query = new Zend_Search_Lucene_Search_Query_Term($term);
$query->setBoost($this->_boost);
return $query;
}
if (count($tokens) == 1 && $this->_fuzzyQuery) {
$term = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);
$query = new Zend_Search_Lucene_Search_Query_Fuzzy($term, $this->_similarity);
$query->setBoost($this->_boost);
return $query;
}
if ($this->_fuzzyQuery) {
require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search is supported only for non-multiple word terms');
}
//It's not empty or one term query
$query = new Zend_Search_Lucene_Search_Query_MultiTerm();
/**
* @todo Process $token->getPositionIncrement() to support stemming, synonyms and other
* analyzer design features
*/
foreach ($tokens as $token) {
$term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $this->_field);
$query->addTerm($term, true); // all subterms are required
}
$query->setBoost($this->_boost);
return $query;
}
}
Lucene/Search/QueryEntry/Phrase.php 0000666 00000007712 15125712134 0013262 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Index_Term */
require_once 'Zend/Search/Lucene/Index/Term.php';
/** Zend_Search_Lucene_Search_QueryEntry */
require_once 'Zend/Search/Lucene/Search/QueryEntry.php';
/** Zend_Search_Lucene_Analysis_Analyzer */
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_QueryEntry_Phrase extends Zend_Search_Lucene_Search_QueryEntry
{
    /**
     * Phrase text, exactly as passed to the constructor
     *
     * @var string
     */
    private $_phrase;

    /**
     * Field the phrase terms are created for
     * (handed over to Zend_Search_Lucene_Index_Term as is)
     *
     * @var string|null
     */
    private $_field;

    /**
     * True once the '~' modifier has been processed for this entry
     *
     * @var boolean
     */
    private $_proximityQuery = false;

    /**
     * Words distance, used as the phrase slop for proximity queries
     *
     * @var integer
     */
    private $_wordsDistance = 0;

    /**
     * Object constructor
     *
     * @param string $phrase
     * @param string $field
     */
    public function __construct($phrase, $field)
    {
        $this->_field  = $field;
        $this->_phrase = $phrase;
    }

    /**
     * Process modifier ('~')
     *
     * Marks the entry as a proximity query. The words distance is replaced
     * only when a non-null parameter is given.
     *
     * @param mixed $parameter
     */
    public function processFuzzyProximityModifier($parameter = null)
    {
        if ($parameter !== null) {
            $this->_wordsDistance = $parameter;
        }

        $this->_proximityQuery = true;
    }

    /**
     * Transform entry to a subquery
     *
     * The phrase is tokenized with the default analyzer:
     *  - no tokens      => "insignificant" query;
     *  - a single token => plain term query;
     *  - several tokens => phrase query (with slop, if '~' modifier was applied).
     *
     * @param string $encoding
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Search_QueryParserException
     */
    public function getQuery($encoding)
    {
        $containsWildcard = strpos($this->_phrase, '?') !== false
                         || strpos($this->_phrase, '*') !== false;

        if ($containsWildcard) {
            require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
            throw new Zend_Search_Lucene_Search_QueryParserException('Wildcards are only allowed in a single terms.');
        }

        $analyzer   = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
        $tokenList  = $analyzer->tokenize($this->_phrase, $encoding);
        $tokenCount = count($tokenList);

        if ($tokenCount == 0) {
            return new Zend_Search_Lucene_Search_Query_Insignificant();
        }

        if ($tokenCount == 1) {
            $singleTerm = new Zend_Search_Lucene_Index_Term($tokenList[0]->getTermText(), $this->_field);
            $termQuery  = new Zend_Search_Lucene_Search_Query_Term($singleTerm);
            $termQuery->setBoost($this->_boost);

            return $termQuery;
        }

        // Two or more tokens: build a phrase query, keeping the positions reported by the analyzer
        $phraseQuery  = new Zend_Search_Lucene_Search_Query_Phrase();
        $termPosition = -1;

        foreach ($tokenList as $phraseToken) {
            $termPosition += $phraseToken->getPositionIncrement();
            $phraseQuery->addTerm(
                new Zend_Search_Lucene_Index_Term($phraseToken->getTermText(), $this->_field),
                $termPosition
            );
        }

        if ($this->_proximityQuery) {
            $phraseQuery->setSlop($this->_wordsDistance);
        }

        $phraseQuery->setBoost($this->_boost);

        return $phraseQuery;
    }
}
Lucene/Index/Writer.php 0000666 00000106707 15125712134 0011033 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter */
require_once 'Zend/Search/Lucene/Index/SegmentWriter/DocumentWriter.php';
/** Zend_Search_Lucene_Index_SegmentInfo */
require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
/** Zend_Search_Lucene_Index_SegmentMerger */
require_once 'Zend/Search/Lucene/Index/SegmentMerger.php';
/** Zend_Search_Lucene_LockManager */
require_once 'Zend/Search/Lucene/LockManager.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_Writer
{
/**
* @todo Implement Analyzer substitution
* @todo Implement Zend_Search_Lucene_Storage_DirectoryRAM and Zend_Search_Lucene_Storage_FileRAM to use it for
* temporary index files
* @todo Directory lock processing
*/
/**
* Number of documents required before the buffered in-memory
* documents are written into a new Segment
*
* Default value is 10
*
* @var integer
*/
public $maxBufferedDocs = 10;
/**
* Largest number of documents ever merged by addDocument().
* Small values (e.g., less than 10,000) are best for interactive indexing,
* as this limits the length of pauses while indexing to a few seconds.
* Larger values are best for batched indexing and speedier searches.
*
* Default value is PHP_INT_MAX
*
* @var integer
*/
public $maxMergeDocs = PHP_INT_MAX;
/**
* Determines how often segment indices are merged by addDocument().
*
* With smaller values, less RAM is used while indexing,
* and searches on unoptimized indices are faster,
* but indexing speed is slower.
*
* With larger values, more RAM is used during indexing,
* and while searches on unoptimized indices are slower,
* indexing is faster.
*
* Thus larger values (> 10) are best for batch index creation,
* and smaller values (< 10) for indices that are interactively maintained.
*
* Default value is 10
*
* @var integer
*/
public $mergeFactor = 10;
/**
* File system adapter.
*
* @var Zend_Search_Lucene_Storage_Directory
*/
private $_directory = null;
/**
* Changes counter.
*
* @var integer
*/
private $_versionUpdate = 0;
/**
* List of the segments, created by index writer
* Array of Zend_Search_Lucene_Index_SegmentInfo objects
*
* @var array
*/
private $_newSegments = array();
/**
* List of segments to be deleted on commit
*
* @var array
*/
private $_segmentsToDelete = array();
/**
* Current segment to add documents
*
* @var Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter
*/
private $_currentSegment = null;
/**
* Array of Zend_Search_Lucene_Index_SegmentInfo objects for this index.
*
* It's a reference to the corresponding Zend_Search_Lucene::$_segmentInfos array
*
* @var array Zend_Search_Lucene_Index_SegmentInfo
*/
private $_segmentInfos;
/**
* Index target format version
*
* @var integer
*/
private $_targetFormatVersion;
/**
* List of index file extensions
*
* @var array
*/
private static $_indexExtensions = array('.cfs' => '.cfs',
'.cfx' => '.cfx',
'.fnm' => '.fnm',
'.fdx' => '.fdx',
'.fdt' => '.fdt',
'.tis' => '.tis',
'.tii' => '.tii',
'.frq' => '.frq',
'.prx' => '.prx',
'.tvx' => '.tvx',
'.tvd' => '.tvd',
'.tvf' => '.tvf',
'.del' => '.del',
'.sti' => '.sti' );
/**
 * Create empty index
 *
 * With $generation == 0 the index is created in pre-2.1 mode: existing index files
 * are removed from the directory, then 'segments' and 'deletable' files are written.
 * Otherwise 'segments.gen' and the segments file of the given generation are written.
 *
 * @param Zend_Search_Lucene_Storage_Directory $directory
 * @param integer $generation
 * @param integer $nameCount
 */
public static function createIndex(Zend_Search_Lucene_Storage_Directory $directory, $generation, $nameCount)
{
    $isPre21Mode = ($generation == 0);

    if ($isPre21Mode) {
        // Create index in pre-2.1 mode
        foreach ($directory->fileList() as $file) {
            if ($file == 'deletable' ||
                $file == 'segments'  ||
                isset(self::$_indexExtensions[ substr($file, strlen($file)-4)]) ||
                preg_match('/\.f\d+$/i', $file) /* matches <segment_name>.f<decimal_number> file names */) {
                $directory->deleteFile($file);
            }
        }

        $segmentsFile = $directory->createFile('segments');
        $formatMarker = (int)0xFFFFFFFF;
    } else {
        $genFile = $directory->createFile('segments.gen');
        $genFile->writeInt((int)0xFFFFFFFE);
        // Write generation two times
        $genFile->writeLong($generation);
        $genFile->writeLong($generation);

        $segmentsFile = $directory->createFile(Zend_Search_Lucene::getSegmentFileName($generation));
        $formatMarker = (int)0xFFFFFFFD;
    }

    $segmentsFile->writeInt($formatMarker);

    // Write version (initialized by current time) as two 32-bit halves.
    // The timestamp is truncated to an integer explicitly: applying '&' to a float with
    // a fractional part relies on an implicit float-to-int conversion (deprecated since PHP 8.1).
    $version = (int) microtime(true);
    $segmentsFile->writeInt((int)($version/((double)0xFFFFFFFF + 1)));
    $segmentsFile->writeInt((int)($version & 0xFFFFFFFF));

    // write name counter
    $segmentsFile->writeInt($nameCount);
    // write segment counter
    $segmentsFile->writeInt(0);

    if ($isPre21Mode) {
        $deletableFile = $directory->createFile('deletable');
        // write counter
        $deletableFile->writeInt(0);
    }
}
/**
 * Open the index for writing
 *
 * The segment infos array is stored by reference, so changes made by the writer
 * are visible through the variable passed by the caller.
 *
 * @param Zend_Search_Lucene_Storage_Directory $directory
 * @param array $segmentInfos
 * @param integer $targetFormatVersion
 */
public function __construct(Zend_Search_Lucene_Storage_Directory $directory, &$segmentInfos, $targetFormatVersion)
{
    $this->_targetFormatVersion = $targetFormatVersion;
    $this->_segmentInfos        =& $segmentInfos;
    $this->_directory           = $directory;
}
/**
 * Adds a document to this index.
 *
 * The document goes into the current in-memory segment (created on demand).
 * Once that segment holds at least $maxBufferedDocs documents, it is committed.
 * Afterwards segments are merged if necessary and the changes counter is increased.
 *
 * @param Zend_Search_Lucene_Document $document
 */
public function addDocument(Zend_Search_Lucene_Document $document)
{
    if (null === $this->_currentSegment) {
        $segmentName           = $this->_newSegmentName();
        $this->_currentSegment = new Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter(
            $this->_directory,
            $segmentName
        );
    }

    $this->_currentSegment->addDocument($document);

    $bufferIsFull = $this->_currentSegment->count() >= $this->maxBufferedDocs;
    if ($bufferIsFull) {
        $this->commit();
    }

    $this->_maybeMergeSegments();

    $this->_versionUpdate += 1;
}
/**
 * Check if we have anything to merge
 *
 * Segment sizes are walked in ascending order and grouped into merge levels.
 * A level starts at $maxBufferedDocs documents and is multiplied by $mergeFactor
 * each time a segment doesn't fit into the current level. A merge is needed as soon
 * as the accumulated size of a level reaches the level threshold. The scan stops
 * with "nothing to merge" when the threshold exceeds $maxMergeDocs.
 *
 * @return boolean
 */
private function _hasAnythingToMerge()
{
    // Only sizes matter here; segments themselves are not collected
    $segmentSizes = array();
    foreach ($this->_segmentInfos as $segmentInfo) {
        $segmentSizes[] = $segmentInfo->count();
    }
    sort($segmentSizes, SORT_NUMERIC);

    $poolSize    = 0;
    $sizeToMerge = $this->maxBufferedDocs;

    foreach ($segmentSizes as $size) {
        // Check, if segment comes into a new merging block
        while ($size >= $sizeToMerge) {
            // Previous block is large enough to be merged
            if ($poolSize >= $sizeToMerge) {
                return true;
            }

            $poolSize     = 0;
            $sizeToMerge *= $this->mergeFactor;

            if ($sizeToMerge > $this->maxMergeDocs) {
                return false;
            }
        }

        $poolSize += $size;
    }

    return $poolSize >= $sizeToMerge;
}
/**
 * Merge segments if necessary
 *
 * Does nothing if the optimization lock can't be obtained.
 * The optimization lock is released on every exit path, including the case
 * when checking, updating or merging of segments throws an exception
 * (the exception is re-thrown after the lock is released).
 *
 * @throws Exception
 */
private function _maybeMergeSegments()
{
    if (Zend_Search_Lucene_LockManager::obtainOptimizationLock($this->_directory) === false) {
        return;
    }

    try {
        if ($this->_hasAnythingToMerge()) {
            $this->_autoMergeSegments();
        }
    } catch (Exception $e) {
        // Don't keep the optimization lock if merging failed
        Zend_Search_Lucene_LockManager::releaseOptimizationLock($this->_directory);
        throw $e;
    }

    Zend_Search_Lucene_LockManager::releaseOptimizationLock($this->_directory);
}

/**
 * Perform standard auto-optimization procedure
 *
 * Must be called with the optimization lock held (see _maybeMergeSegments()).
 */
private function _autoMergeSegments()
{
    // Update segments list to be sure all segments are not merged yet by another process
    //
    // Segment merging functionality is concentrated in this class and surrounded
    // by optimization lock obtaining/releasing.
    // _updateSegments() refreshes segments list from the latest index generation.
    // So only new segments can be added to the index while we are merging some already existing
    // segments.
    // Newly added segments will be also included into the index by the _updateSegments() call
    // either by another process or by the current process with the commit() call at the end of _mergeSegments() method.
    // That's guaranteed by the serialisation of _updateSegments() execution using exclusive locks.
    $this->_updateSegments();

    $segmentSizes = array();
    foreach ($this->_segmentInfos as $segName => $segmentInfo) {
        $segmentSizes[$segName] = $segmentInfo->count();
    }

    $mergePool   = array();
    $poolSize    = 0;
    $sizeToMerge = $this->maxBufferedDocs;
    asort($segmentSizes, SORT_NUMERIC);

    foreach ($segmentSizes as $segName => $size) {
        // Check, if segment comes into a new merging block
        while ($size >= $sizeToMerge) {
            // Merge previous block if it's large enough
            if ($poolSize >= $sizeToMerge) {
                $this->_mergeSegments($mergePool);
            }
            $mergePool = array();
            $poolSize  = 0;

            $sizeToMerge *= $this->mergeFactor;

            if ($sizeToMerge > $this->maxMergeDocs) {
                // Larger blocks must not be merged automatically
                return;
            }
        }

        $mergePool[] = $this->_segmentInfos[$segName];
        $poolSize   += $size;
    }

    if ($poolSize >= $sizeToMerge) {
        $this->_mergeSegments($mergePool);
    }
}
/**
 * Merge specified segments
 *
 * $segments is an array of SegmentInfo objects.
 * Every source segment is scheduled for deletion, the segment produced
 * by the merger (if any) is registered as a new one and changes are committed.
 *
 * @param array $segments
 */
private function _mergeSegments($segments)
{
    $mergedName    = $this->_newSegmentName();
    $segmentMerger = new Zend_Search_Lucene_Index_SegmentMerger($this->_directory, $mergedName);

    foreach ($segments as $sourceSegment) {
        $segmentMerger->addSource($sourceSegment);

        $sourceName                           = $sourceSegment->getName();
        $this->_segmentsToDelete[$sourceName] = $sourceName;
    }

    $mergedSegment = $segmentMerger->merge();
    if (null !== $mergedSegment) {
        $this->_newSegments[$mergedSegment->getName()] = $mergedSegment;
    }

    $this->commit();
}
/**
* Write a new generation of the segments file and clean up obsolete index files
*
* The following is performed under the exclusive index write lock:
*  - pending changes of every segment in $this->_segmentInfos are written down;
*  - the segments file of the actual generation is copied into a file of the next generation,
*    skipping segments listed in $this->_segmentsToDelete and appending $this->_newSegments;
*  - 'segments.gen' is updated with the new generation number;
*  - if the read lock can be escalated, index files which are not used any more are deleted,
*    otherwise purgeFile() is called for the files of deleted segments;
*  - segments which are not listed in the new segments file are removed from $this->_segmentInfos.
*
* If writing of the new segments file fails, the previous generation number is written back
* into 'segments.gen', the write lock is released and the exception is re-thrown.
*
* @throws Zend_Search_Lucene_Exception
*/
private function _updateSegments()
{
// Get an exclusive index lock
Zend_Search_Lucene_LockManager::obtainWriteLock($this->_directory);
// Write down changes for the segments
foreach ($this->_segmentInfos as $segInfo) {
$segInfo->writeChanges();
}
// Source is the segments file of the actual generation, target is the file of the next generation
$generation = Zend_Search_Lucene::getActualGeneration($this->_directory);
$segmentsFile = $this->_directory->getFileObject(Zend_Search_Lucene::getSegmentFileName($generation), false);
$newSegmentFile = $this->_directory->createFile(Zend_Search_Lucene::getSegmentFileName(++$generation), false);
// Open 'segments.gen' or create it, if it's reported as not readable
try {
$genFile = $this->_directory->getFileObject('segments.gen', false);
} catch (Zend_Search_Lucene_Exception $e) {
if (strpos($e->getMessage(), 'is not readable') !== false) {
$genFile = $this->_directory->createFile('segments.gen');
} else {
throw $e;
}
}
$genFile->writeInt((int)0xFFFFFFFE);
// Write generation (first copy)
$genFile->writeLong($generation);
try {
// Write format marker
if ($this->_targetFormatVersion == Zend_Search_lucene::FORMAT_2_1) {
$newSegmentFile->writeInt((int)0xFFFFFFFD);
} else if ($this->_targetFormatVersion == Zend_Search_lucene::FORMAT_2_3) {
$newSegmentFile->writeInt((int)0xFFFFFFFC);
}
// Read src file format identifier
$format = $segmentsFile->readInt();
if ($format == (int)0xFFFFFFFF) {
$srcFormat = Zend_Search_Lucene::FORMAT_PRE_2_1;
} else if ($format == (int)0xFFFFFFFD) {
$srcFormat = Zend_Search_Lucene::FORMAT_2_1;
} else if ($format == (int)0xFFFFFFFC) {
$srcFormat = Zend_Search_Lucene::FORMAT_2_3;
} else {
throw new Zend_Search_Lucene_Exception('Unsupported segments file format');
}
// $version = $segmentsFile->readLong() + $this->_versionUpdate;
// Process version on 32-bit platforms
// (64-bit value is read as two 32-bit halves and combined into a float)
$versionHigh = $segmentsFile->readInt();
$versionLow = $segmentsFile->readInt();
$version = $versionHigh * ((double)0xFFFFFFFF + 1) +
(($versionLow < 0)? (double)0xFFFFFFFF - (-1 - $versionLow) : $versionLow);
// Add the number of changes collected since the previous update and reset the counter
$version += $this->_versionUpdate;
$this->_versionUpdate = 0;
$newSegmentFile->writeInt((int)($version/((double)0xFFFFFFFF + 1)));
$newSegmentFile->writeInt((int)($version & 0xFFFFFFFF));
// Write segment name counter
$newSegmentFile->writeInt($segmentsFile->readInt());
// Get number of segments offset
$numOfSegmentsOffset = $newSegmentFile->tell();
// Write dummy data (segment counter)
// The real value is written at this offset after all segments are processed
$newSegmentFile->writeInt(0);
// Read number of segments
$segmentsCount = $segmentsFile->readInt();
// Segments written into the new segments file: segment name => segment size
$segments = array();
for ($count = 0; $count < $segmentsCount; $count++) {
$segName = $segmentsFile->readString();
$segSize = $segmentsFile->readInt();
if ($srcFormat == Zend_Search_Lucene::FORMAT_PRE_2_1) {
// pre-2.1 index format
// No more per segment data is read from the source file, defaults are used instead
$delGenHigh = 0;
$delGenLow = 0;
$hasSingleNormFile = false;
$numField = (int)0xFFFFFFFF;
$isCompoundByte = 0;
$docStoreOptions = null;
} else {
//$delGen = $segmentsFile->readLong();
$delGenHigh = $segmentsFile->readInt();
$delGenLow = $segmentsFile->readInt();
if ($srcFormat == Zend_Search_Lucene::FORMAT_2_3) {
$docStoreOffset = $segmentsFile->readInt();
if ($docStoreOffset != -1) {
$docStoreSegment = $segmentsFile->readString();
$docStoreIsCompoundFile = $segmentsFile->readByte();
$docStoreOptions = array('offset' => $docStoreOffset,
'segment' => $docStoreSegment,
'isCompound' => ($docStoreIsCompoundFile == 1));
} else {
$docStoreOptions = null;
}
} else {
$docStoreOptions = null;
}
$hasSingleNormFile = $segmentsFile->readByte();
$numField = $segmentsFile->readInt();
// Norm generations are present only if $numField is not -1
$normGens = array();
if ($numField != (int)0xFFFFFFFF) {
for ($count1 = 0; $count1 < $numField; $count1++) {
$normGens[] = $segmentsFile->readLong();
}
}
$isCompoundByte = $segmentsFile->readByte();
}
// Segments scheduled for deletion are not copied into the new segments file
if (!in_array($segName, $this->_segmentsToDelete)) {
// Load segment if necessary
if (!isset($this->_segmentInfos[$segName])) {
if (PHP_INT_SIZE > 4) {
// 64-bit system
$delGen = $delGenHigh << 32 |
$delGenLow;
} else {
$delGen = $delGenHigh * ((double)0xFFFFFFFF + 1) +
(($delGenLow < 0)? (double)0xFFFFFFFF - (-1 - $delGenLow) : $delGenLow);
}
if ($isCompoundByte == 0xFF) {
// The segment is not a compound file
$isCompound = false;
} else if ($isCompoundByte == 0x00) {
// The status is unknown
$isCompound = null;
} else if ($isCompoundByte == 0x01) {
// The segment is a compound file
$isCompound = true;
}
$this->_segmentInfos[$segName] =
new Zend_Search_Lucene_Index_SegmentInfo($this->_directory,
$segName,
$segSize,
$delGen,
$docStoreOptions,
$hasSingleNormFile,
$isCompound);
} else {
// Retrieve actual deletions file generation number
$delGen = $this->_segmentInfos[$segName]->getDelGen();
if ($delGen >= 0) {
if (PHP_INT_SIZE > 4) {
// 64-bit system
$delGenHigh = $delGen >> 32 & 0xFFFFFFFF;
$delGenLow = $delGen & 0xFFFFFFFF;
} else {
$delGenHigh = (int)($delGen/((double)0xFFFFFFFF + 1));
$delGenLow =(int)($delGen & 0xFFFFFFFF);
}
} else {
// Negative generation is stored as -1 (both halves are 0xFFFFFFFF)
$delGenHigh = $delGenLow = (int)0xFFFFFFFF;
}
}
$newSegmentFile->writeString($segName);
$newSegmentFile->writeInt($segSize);
$newSegmentFile->writeInt($delGenHigh);
$newSegmentFile->writeInt($delGenLow);
if ($this->_targetFormatVersion == Zend_Search_Lucene::FORMAT_2_3) {
if ($docStoreOptions !== null) {
$newSegmentFile->writeInt($docStoreOffset);
$newSegmentFile->writeString($docStoreSegment);
$newSegmentFile->writeByte($docStoreIsCompoundFile);
} else {
// Set DocStoreOffset to -1
$newSegmentFile->writeInt((int)0xFFFFFFFF);
}
} else if ($docStoreOptions !== null) {
// Release index write lock
// NOTE(review): the exception thrown below is caught by the enclosing catch block,
// which calls releaseWriteLock() once more.
Zend_Search_Lucene_LockManager::releaseWriteLock($this->_directory);
throw new Zend_Search_Lucene_Exception('Index conversion to lower format version is not supported.');
}
$newSegmentFile->writeByte($hasSingleNormFile);
$newSegmentFile->writeInt($numField);
if ($numField != (int)0xFFFFFFFF) {
foreach ($normGens as $normGen) {
$newSegmentFile->writeLong($normGen);
}
}
$newSegmentFile->writeByte($isCompoundByte);
$segments[$segName] = $segSize;
}
}
$segmentsFile->close();
$segmentsCount = count($segments) + count($this->_newSegments);
// Append segments created by this writer
foreach ($this->_newSegments as $segName => $segmentInfo) {
$newSegmentFile->writeString($segName);
$newSegmentFile->writeInt($segmentInfo->count());
// delete file generation: -1 (there is no delete file yet)
$newSegmentFile->writeInt((int)0xFFFFFFFF);$newSegmentFile->writeInt((int)0xFFFFFFFF);
if ($this->_targetFormatVersion == Zend_Search_Lucene::FORMAT_2_3) {
// docStoreOffset: -1 (segment doesn't use shared doc store)
$newSegmentFile->writeInt((int)0xFFFFFFFF);
}
// HasSingleNormFile
$newSegmentFile->writeByte($segmentInfo->hasSingleNormFile());
// NumField
$newSegmentFile->writeInt((int)0xFFFFFFFF);
// IsCompoundFile
$newSegmentFile->writeByte($segmentInfo->isCompound() ? 1 : -1);
$segments[$segmentInfo->getName()] = $segmentInfo->count();
$this->_segmentInfos[$segName] = $segmentInfo;
}
$this->_newSegments = array();
// Replace dummy segment counter with the real value
$newSegmentFile->seek($numOfSegmentsOffset);
$newSegmentFile->writeInt($segmentsCount); // Update segments count
$newSegmentFile->close();
} catch (Exception $e) {
/** Restore previous index generation */
$generation--;
// Skip the 4 bytes int written at the beginning of 'segments.gen'
$genFile->seek(4, SEEK_SET);
// Write generation number twice
$genFile->writeLong($generation); $genFile->writeLong($generation);
// Release index write lock
Zend_Search_Lucene_LockManager::releaseWriteLock($this->_directory);
// Throw the exception
throw $e;
}
// Write generation (second copy)
$genFile->writeLong($generation);
// Check if another update or read process is not running now
// If yes, skip clean-up procedure
if (Zend_Search_Lucene_LockManager::escalateReadLock($this->_directory)) {
/**
* Clean-up directory
*/
// Three parallel arrays: file name, deletion group and order number within the group
$filesToDelete = array();
$filesTypes = array();
$filesNumbers = array();
// list of .del files of currently used segments
// each segment can have several generations of .del files
// only last should not be deleted
$delFiles = array();
foreach ($this->_directory->fileList() as $file) {
if ($file == 'deletable') {
// 'deletable' file
$filesToDelete[] = $file;
$filesTypes[] = 0; // delete this file first, since it's not used starting from Lucene v2.1
$filesNumbers[] = 0;
} else if ($file == 'segments') {
// 'segments' file
$filesToDelete[] = $file;
$filesTypes[] = 1; // second file to be deleted "zero" version of segments file (Lucene pre-2.1)
$filesNumbers[] = 0;
} else if (preg_match('/^segments_[a-zA-Z0-9]+$/i', $file)) {
// 'segments_xxx' file
// Check if it's not a just created generation file
if ($file != Zend_Search_Lucene::getSegmentFileName($generation)) {
$filesToDelete[] = $file;
$filesTypes[] = 2; // first group of files for deletions
$filesNumbers[] = (int)base_convert(substr($file, 9), 36, 10); // ordered by segment generation numbers
}
} else if (preg_match('/(^_([a-zA-Z0-9]+))\.f\d+$/i', $file, $matches)) {
// one of per segment files ('<segment_name>.f<decimal_number>')
// Check if it's not one of the segments in the current segments set
if (!isset($segments[$matches[1]])) {
$filesToDelete[] = $file;
$filesTypes[] = 3; // second group of files for deletions
$filesNumbers[] = (int)base_convert($matches[2], 36, 10); // order by segment number
}
} else if (preg_match('/(^_([a-zA-Z0-9]+))(_([a-zA-Z0-9]+))\.del$/i', $file, $matches)) {
// one of per segment files ('<segment_name>_<del_generation>.del' where <segment_name> is '_<segment_number>')
// Check if it's not one of the segments in the current segments set
if (!isset($segments[$matches[1]])) {
$filesToDelete[] = $file;
$filesTypes[] = 3; // second group of files for deletions
$filesNumbers[] = (int)base_convert($matches[2], 36, 10); // order by segment number
} else {
// .del file of a segment in use: collect it by segment number and deletions generation
$segmentNumber = (int)base_convert($matches[2], 36, 10);
$delGeneration = (int)base_convert($matches[4], 36, 10);
if (!isset($delFiles[$segmentNumber])) {
$delFiles[$segmentNumber] = array();
}
$delFiles[$segmentNumber][$delGeneration] = $file;
}
} else if (isset(self::$_indexExtensions[substr($file, strlen($file)-4)])) {
// one of per segment files ('<segment_name>.<ext>')
$segmentName = substr($file, 0, strlen($file) - 4);
// Check if it's not one of the segments in the current segments set
// (files of the segment which is currently being written are kept too)
if (!isset($segments[$segmentName]) &&
($this->_currentSegment === null || $this->_currentSegment->getName() != $segmentName)) {
$filesToDelete[] = $file;
$filesTypes[] = 3; // second group of files for deletions
$filesNumbers[] = (int)base_convert(substr($file, 1 /* skip '_' */, strlen($file)-5), 36, 10); // order by segment number
}
}
}
$maxGenNumber = 0;
// process .del files of currently used segments
foreach ($delFiles as $segmentNumber => $segmentDelFiles) {
ksort($delFiles[$segmentNumber], SORT_NUMERIC);
array_pop($delFiles[$segmentNumber]); // remove last delete file generation from candidates for deleting
end($delFiles[$segmentNumber]);
$lastGenNumber = key($delFiles[$segmentNumber]);
if ($lastGenNumber > $maxGenNumber) {
$maxGenNumber = $lastGenNumber;
}
}
foreach ($delFiles as $segmentNumber => $segmentDelFiles) {
foreach ($segmentDelFiles as $delGeneration => $file) {
$filesToDelete[] = $file;
$filesTypes[] = 4; // third group of files for deletions
$filesNumbers[] = $segmentNumber*$maxGenNumber + $delGeneration; // order by <segment_number>,<del_generation> pair
}
}
// Reorder files for deleting
// (by group, then by order number, then by file name)
array_multisort($filesTypes, SORT_ASC, SORT_NUMERIC,
$filesNumbers, SORT_ASC, SORT_NUMERIC,
$filesToDelete, SORT_ASC, SORT_STRING);
foreach ($filesToDelete as $file) {
try {
/** Skip shared docstore segments deleting */
/** @todo Process '.cfx' files to check if they are already unused */
if (substr($file, strlen($file)-4) != '.cfx') {
$this->_directory->deleteFile($file);
}
} catch (Zend_Search_Lucene_Exception $e) {
if (strpos($e->getMessage(), 'Can\'t delete file') === false) {
// That's not "file is under processing or already deleted" exception
// Pass it through
throw $e;
}
}
}
// Return read lock into the previous state
Zend_Search_Lucene_LockManager::deEscalateReadLock($this->_directory);
} else {
// Only release resources if another index reader is running now
foreach ($this->_segmentsToDelete as $segName) {
foreach (self::$_indexExtensions as $ext) {
$this->_directory->purgeFile($segName . $ext);
}
}
}
// Clean-up _segmentsToDelete container
$this->_segmentsToDelete = array();
// Release index write lock
Zend_Search_Lucene_LockManager::releaseWriteLock($this->_directory);
// Remove unused segments from segments list
foreach ($this->_segmentInfos as $segName => $segmentInfo) {
if (!isset($segments[$segName])) {
unset($this->_segmentInfos[$segName]);
}
}
}
/**
 * Commit current changes
 *
 * Closes the segment which is currently being written (if any), registers the
 * resulting segment as a new one and updates the segments file.
 */
public function commit()
{
    $pendingWriter = $this->_currentSegment;

    if ($pendingWriter !== null) {
        $closedSegment = $pendingWriter->close();

        if ($closedSegment !== null) {
            $closedName                      = $closedSegment->getName();
            $this->_newSegments[$closedName] = $closedSegment;
        }

        $this->_currentSegment = null;
    }

    $this->_updateSegments();
}
/**
* Merges the provided indexes into this index.
*
* NOTE: not implemented yet. The method body is empty, so the call has no effect
* and $readers is ignored.
*
* @param array $readers
* @return void
*/
public function addIndexes($readers)
{
/**
* @todo implementation
*/
}
/**
 * Merges all segments together into new one
 *
 * Returns true on success and false if another optimization or auto-optimization process
 * is running now
 *
 * The optimization lock is released even if updating or merging of segments fails
 * (the exception is re-thrown after the lock is released).
 *
 * @return boolean
 * @throws Exception
 */
public function optimize()
{
    if (Zend_Search_Lucene_LockManager::obtainOptimizationLock($this->_directory) === false) {
        return false;
    }

    try {
        // Update segments list to be sure all segments are not merged yet by another process
        //
        // Segment merging functionality is concentrated in this class and surrounded
        // by optimization lock obtaining/releasing.
        // _updateSegments() refreshes segments list from the latest index generation.
        // So only new segments can be added to the index while we are merging some already existing
        // segments.
        // Newly added segments will be also included into the index by the _updateSegments() call
        // either by another process or by the current process with the commit() call at the end of _mergeSegments() method.
        // That's guaranteed by the serialisation of _updateSegments() execution using exclusive locks.
        $this->_updateSegments();

        $this->_mergeSegments($this->_segmentInfos);
    } catch (Exception $e) {
        // Don't keep the optimization lock if optimization failed
        Zend_Search_Lucene_LockManager::releaseOptimizationLock($this->_directory);
        throw $e;
    }

    Zend_Search_Lucene_LockManager::releaseOptimizationLock($this->_directory);

    return true;
}
/**
 * Get name for new segment
 *
 * Reads the segment name counter from the segments file of the actual generation,
 * stores the incremented value back and returns '_' followed by the counter in base 36.
 * The read-modify-write sequence is performed under the index write lock, which is
 * released even if the sequence fails (the exception is re-thrown).
 *
 * @return string
 * @throws Exception
 */
private function _newSegmentName()
{
    Zend_Search_Lucene_LockManager::obtainWriteLock($this->_directory);

    try {
        $generation   = Zend_Search_Lucene::getActualGeneration($this->_directory);
        $segmentsFile = $this->_directory->getFileObject(Zend_Search_Lucene::getSegmentFileName($generation), false);

        $segmentsFile->seek(12); // 12 = 4 (int, file format marker) + 8 (long, index version)
        $segmentNameCounter = $segmentsFile->readInt();

        $segmentsFile->seek(12); // 12 = 4 (int, file format marker) + 8 (long, index version)
        $segmentsFile->writeInt($segmentNameCounter + 1);

        // Flush output to guarantee that wrong value will not be loaded between unlock and
        // return (which calls $segmentsFile destructor)
        $segmentsFile->flush();
    } catch (Exception $e) {
        // Don't keep the index write lock if the counter can't be updated
        Zend_Search_Lucene_LockManager::releaseWriteLock($this->_directory);
        throw $e;
    }

    Zend_Search_Lucene_LockManager::releaseWriteLock($this->_directory);

    return '_' . base_convert($segmentNameCounter, 10, 36);
}
}
Lucene/Index/Term.php 0000666 00000007343 15125712134 0010462 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
 * A Term represents a word from text. This is the unit of search. It is
 * composed of two elements, the text of the word, as a string, and the name of
 * the field that the text occurred in, an interned string.
 *
 * Note that terms may represent more than words from text fields, but also
 * things like dates, email addresses, urls, etc.
 *
 * @category Zend
 * @package Zend_Search_Lucene
 * @subpackage Index
 * @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 * @license http://framework.zend.com/license/new-bsd New BSD License
 */
class Zend_Search_Lucene_Index_Term
{
    /**
     * Field name or field number (depending from context)
     *
     * @var mixed
     */
    public $field;

    /**
     * Term value
     *
     * @var string
     */
    public $text;

    /**
     * Object constructor
     *
     * If $field is null, the default search field
     * (Zend_Search_Lucene::getDefaultSearchField()) is used.
     *
     * @param string $text
     * @param mixed  $field
     */
    public function __construct($text, $field = null)
    {
        $this->field = ($field === null)? Zend_Search_Lucene::getDefaultSearchField() : $field;
        $this->text  = $text;
    }

    /**
     * Returns term key
     *
     * The key is the field and the term text joined by a NUL character.
     *
     * @return string
     */
    public function key()
    {
        return $this->field . chr(0) . $this->text;
    }

    /**
     * Get term prefix
     *
     * Returns at most $length leading characters of $str, where a character is
     * a UTF-8 byte sequence as described by its lead byte. A sequence which is
     * cut off by the end of the string is not included into the prefix.
     *
     * @param string $str
     * @param integer $length
     * @return string
     */
    public static function getPrefix($str, $length)
    {
        $totalBytes  = strlen($str);
        $prefixBytes = 0;
        $prefixChars = 0;

        while ($prefixBytes < $totalBytes  &&  $prefixChars < $length) {
            $charBytes = self::_getCharBytes($str[$prefixBytes]);

            if ($prefixBytes + $charBytes > $totalBytes) {
                // wrong character
                break;
            }

            $prefixChars++;
            $prefixBytes += $charBytes;
        }

        return substr($str, 0, $prefixBytes);
    }

    /**
     * Get UTF-8 string length
     *
     * Returns the number of characters (UTF-8 byte sequences as described by their
     * lead bytes). A sequence which is cut off by the end of the string is not counted.
     *
     * @param string $str
     * @return integer
     */
    public static function getLength($str)
    {
        $totalBytes = strlen($str);
        $bytes      = 0;
        $chars      = 0;

        while ($bytes < $totalBytes) {
            $charBytes = self::_getCharBytes($str[$bytes]);

            if ($bytes + $charBytes > $totalBytes) {
                // wrong character
                break;
            }

            $chars++;
            $bytes += $charBytes;
        }

        return $chars;
    }

    /**
     * Get the length (in bytes) of a character by its first byte
     *
     * Bytes with both high bits set (11xxxxxx) start a multi-byte sequence:
     * 2 bytes by default, 3 if bit 0x20 is also set and 4 if bits 0x20 and 0x10 are set.
     * Any other byte is treated as a single-byte character.
     *
     * @param string $leadByte
     * @return integer
     */
    private static function _getCharBytes($leadByte)
    {
        $code = ord($leadByte);

        if (($code & 0xC0) != 0xC0) {
            return 1;
        }
        if (!($code & 0x20)) {
            return 2;
        }
        if (!($code & 0x10)) {
            return 3;
        }

        return 4;
    }
}
Lucene/Index/DictionaryLoader.php 0000666 00000024671 15125712134 0013012 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* Dictionary loader
*
* It's a dummy class which is created to encapsulate poorly structured code.
* Manual "method inlining" is performed to speed up the dictionary index loading
* operation, which is a major bottleneck for search performance.
*
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_DictionaryLoader
{
    /**
     * Dictionary index loader.
     *
     * It takes a string which is actually <segment_name>.tii index file data and
     * returns two arrays - term and termInfo lists.
     *
     * Each term entry is array($termFieldNum, $termValue); each termInfo entry is
     * array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer).
     * The first term entry must be the special index mark (field number
     * 0xFFFFFFFF); its field number is reported as -1.
     *
     * All integer/string readers are inlined on purpose (see the commented-out
     * $tiiFile->read*() calls) to keep loading fast.
     *
     * See Zend_Search_Lucene_Index_SegmentInfo class for details
     *
     * @param string $data
     * @return array  array(&$termDictionary, &$termInfos)
     * @throws Zend_Search_Lucene_Exception
     */
    public static function load($data)
    {
        $termDictionary = array();
        $termInfos      = array();
        $pos = 0;

        // $tiVersion = $tiiFile->readInt();
        $tiVersion = ord($data[0]) << 24 | ord($data[1]) << 16 | ord($data[2]) << 8 | ord($data[3]);
        $pos += 4;
        if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
            $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
        }

        // $indexTermCount = $tiiFile->readLong();
        if (PHP_INT_SIZE > 4) {
            $indexTermCount = ord($data[$pos])   << 56 |
                              ord($data[$pos+1]) << 48 |
                              ord($data[$pos+2]) << 40 |
                              ord($data[$pos+3]) << 32 |
                              ord($data[$pos+4]) << 24 |
                              ord($data[$pos+5]) << 16 |
                              ord($data[$pos+6]) << 8  |
                              ord($data[$pos+7]);
        } else {
            // 32-bit integers: the upper four bytes and the sign bit must be clear
            if ((ord($data[$pos])            != 0) ||
                (ord($data[$pos+1])          != 0) ||
                (ord($data[$pos+2])          != 0) ||
                (ord($data[$pos+3])          != 0) ||
                ((ord($data[$pos+4]) & 0x80) != 0)) {
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Largest supported segment size (for 32-bit mode) is 2Gb');
            }

            $indexTermCount = ord($data[$pos+4]) << 24 |
                              ord($data[$pos+5]) << 16 |
                              ord($data[$pos+6]) << 8  |
                              ord($data[$pos+7]);
        }
        $pos += 8;

        // $tiiFile->readInt(); // IndexInterval
        $pos += 4;

        // $skipInterval = $tiiFile->readInt();
        $skipInterval = ord($data[$pos]) << 24 | ord($data[$pos+1]) << 16 | ord($data[$pos+2]) << 8 | ord($data[$pos+3]);
        $pos += 4;

        if ($indexTermCount < 1) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong number of terms in a term dictionary index');
        }

        if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
            /* Skip MaxSkipLevels value */
            $pos += 4;
        }

        $prevTerm     = '';
        $freqPointer  =  0;
        $proxPointer  =  0;
        $indexPointer =  0;
        for ($count = 0; $count < $indexTermCount; $count++) {
            //$termPrefixLength = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $termPrefixLength = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $termPrefixLength |= ($nbyte & 0x7F) << $shift;
            }

            // $termSuffix = $tiiFile->readString();
            // The stored length is a character count; bytes are pulled in
            // on demand while scanning for multi-byte lead bytes.
            $nbyte = ord($data[$pos++]);
            $len = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $len |= ($nbyte & 0x7F) << $shift;
            }
            if ($len == 0) {
                $termSuffix = '';
            } else {
                $termSuffix = substr($data, $pos, $len);
                $pos += $len;
                for ($count1 = 0; $count1 < $len; $count1++ ) {
                    if (( ord($termSuffix[$count1]) & 0xC0 ) == 0xC0) {
                        $addBytes = 1;
                        if (ord($termSuffix[$count1]) & 0x20 ) {
                            $addBytes++;

                            // Never used for Java Lucene created index.
                            // Java2 doesn't encode strings in four bytes
                            if (ord($termSuffix[$count1]) & 0x10 ) {
                                $addBytes++;
                            }
                        }
                        $termSuffix .= substr($data, $pos, $addBytes);
                        $pos += $addBytes;
                        $len += $addBytes;

                        // Check for null character. Java2 encodes null character
                        // in two bytes.
                        if (ord($termSuffix[$count1]) == 0xC0 &&
                            ord($termSuffix[$count1+1]) == 0x80   ) {
                            // chr(0), not the integer 0: assigning an integer to a
                            // string offset would store the digit character "0".
                            $termSuffix[$count1] = chr(0);
                            $termSuffix = substr($termSuffix,0,$count1+1)
                                        . substr($termSuffix,$count1+2);

                            // The two-byte sequence is now a single byte: shrink the
                            // byte length and do not skip over the next character.
                            $len--;
                        } else {
                            $count1 += $addBytes;
                        }
                    }
                }
            }

            // $termValue = Zend_Search_Lucene_Index_Term::getPrefix($prevTerm, $termPrefixLength) . $termSuffix;
            $pb = 0; $pc = 0;
            while ($pb < strlen($prevTerm)  &&  $pc < $termPrefixLength) {
                $charBytes = 1;
                if ((ord($prevTerm[$pb]) & 0xC0) == 0xC0) {
                    $charBytes++;
                    if (ord($prevTerm[$pb]) & 0x20 ) {
                        $charBytes++;
                        if (ord($prevTerm[$pb]) & 0x10 ) {
                            $charBytes++;
                        }
                    }
                }

                // The prefix is taken from $prevTerm, so that is the string
                // whose end bounds the character.
                if ($pb + $charBytes > strlen($prevTerm)) {
                    // wrong character
                    break;
                }

                $pc++;
                $pb += $charBytes;
            }
            $termValue = substr($prevTerm, 0, $pb) . $termSuffix;

            // $termFieldNum = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $termFieldNum = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $termFieldNum |= ($nbyte & 0x7F) << $shift;
            }

            // $docFreq = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $docFreq = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $docFreq |= ($nbyte & 0x7F) << $shift;
            }

            // $freqPointer += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $freqPointer += $vint;

            // $proxPointer += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $proxPointer += $vint;

            // The skip delta is only stored for terms with enough documents
            if( $docFreq >= $skipInterval ) {
                // $skipDelta = $tiiFile->readVInt();
                $nbyte = ord($data[$pos++]);
                $vint = $nbyte & 0x7F;
                for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                    $nbyte = ord($data[$pos++]);
                    $vint |= ($nbyte & 0x7F) << $shift;
                }
                $skipDelta = $vint;
            } else {
                $skipDelta = 0;
            }

            // $indexPointer += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $indexPointer += $vint;

            // $this->_termDictionary[] = new Zend_Search_Lucene_Index_Term($termValue, $termFieldNum);
            $termDictionary[] = array($termFieldNum, $termValue);

            $termInfos[] =
                 // new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
                 array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);

            $prevTerm = $termValue;
        }

        // Check special index entry mark
        if ($termDictionary[0][0] != (int)0xFFFFFFFF) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
        } else if (PHP_INT_SIZE > 4){
            // Treat 64-bit 0xFFFFFFFF as -1
            $termDictionary[0][0] = -1;
        }

        return array(&$termDictionary, &$termInfos);
    }
}
Lucene/Index/SegmentWriter/DocumentWriter.php 0000666 00000017641 15125712134 0015327 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_Analyzer */
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
/** Zend_Search_Lucene_Index_SegmentWriter */
require_once 'Zend/Search/Lucene/Index/SegmentWriter.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter extends Zend_Search_Lucene_Index_SegmentWriter
{
    /**
     * Term Dictionary
     * Array of Zend_Search_Lucene_Index_Term objects, indexed by term key
     *
     * @var array
     */
    protected $_termDictionary;

    /**
     * Documents, which contain the term
     * Array: term key => (document number => list of term positions)
     *
     * @var array
     */
    protected $_termDocs;

    /**
     * Object constructor.
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param string $name
     */
    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
    {
        parent::__construct($directory, $name);

        $this->_termDocs       = array();
        $this->_termDictionary = array();
    }

    /**
     * Adds a document to this segment.
     *
     * Every field is registered with the segment. Indexed fields contribute
     * their terms and positions to the in-memory dictionary plus one norm
     * byte; stored fields are collected and handed to addStoredFields().
     *
     * @param Zend_Search_Lucene_Document $document
     * @throws Zend_Search_Lucene_Exception  if a field requests term vector storing
     */
    public function addDocument(Zend_Search_Lucene_Document $document)
    {
        $storedFields = array();
        $docNorms     = array();
        $similarity   = Zend_Search_Lucene_Search_Similarity::getDefault();

        foreach ($document->getFieldNames() as $fieldName) {
            $field = $document->getField($fieldName);
            $this->addField($field);

            if ($field->storeTermVector) {
                /**
                 * @todo term vector storing support
                 */
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
            }

            if ($field->isIndexed) {
                if ($field->isTokenized) {
                    $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
                    $analyzer->setInput($field->value, $field->encoding);

                    $position   = 0;
                    $fieldTerms = 0;
                    while (($token = $analyzer->nextToken()) !== null) {
                        $fieldTerms++;

                        $term    = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                        $termKey = $term->key();

                        if (!isset($this->_termDictionary[$termKey])) {
                            // Term is seen for the first time in this segment
                            $this->_termDictionary[$termKey] = $term;
                            $this->_termDocs[$termKey]       = array($this->_docCount => array());
                        } elseif (!isset($this->_termDocs[$termKey][$this->_docCount])) {
                            // Known term, first occurrence in this document
                            $this->_termDocs[$termKey][$this->_docCount] = array();
                        }

                        $position += $token->getPositionIncrement();
                        $this->_termDocs[$termKey][$this->_docCount][] = $position;
                    }
                } else {
                    // The whole field value is indexed as one term at position 0
                    $term    = new Zend_Search_Lucene_Index_Term($field->getUtf8Value(), $field->name);
                    $termKey = $term->key();

                    if (!isset($this->_termDictionary[$termKey])) {
                        // Term is seen for the first time in this segment
                        $this->_termDictionary[$termKey] = $term;
                        $this->_termDocs[$termKey]       = array($this->_docCount => array());
                    } elseif (!isset($this->_termDocs[$termKey][$this->_docCount])) {
                        // Known term, first occurrence in this document
                        $this->_termDocs[$termKey][$this->_docCount] = array();
                    }

                    $this->_termDocs[$termKey][$this->_docCount][] = 0; // position

                    $fieldTerms = 1;
                }

                $normValue = $similarity->lengthNorm($field->name, $fieldTerms)
                           * $document->boost
                           * $field->boost;
                $docNorms[$field->name] = chr($similarity->encodeNorm($normValue));
            }

            if ($field->isStored) {
                $storedFields[] = $field;
            }
        }

        // Append one norm byte per indexed field known to the segment
        foreach ($this->_fields as $fieldName => $field) {
            if (!$field->isIndexed) {
                continue;
            }

            if (!isset($this->_norms[$fieldName])) {
                // First norm for this field: back-fill the documents added earlier
                $this->_norms[$fieldName] = str_repeat(
                    chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) )),
                    $this->_docCount
                );
            }

            $this->_norms[$fieldName] .= isset($docNorms[$fieldName])
                ? $docNorms[$fieldName]
                : chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) ));
        }

        $this->addStoredFields($storedFields);
    }

    /**
     * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files
     *
     * Terms are written in string order of their keys.
     */
    protected function _dumpDictionary()
    {
        ksort($this->_termDictionary, SORT_STRING);

        $this->initializeDictionaryFiles();

        foreach ($this->_termDictionary as $termKey => $term) {
            $this->addTerm($term, $this->_termDocs[$termKey]);
        }

        $this->closeDictionaryFiles();
    }

    /**
     * Close segment, write it to disk and return segment info
     *
     * Nothing is written and null is returned when no documents were added.
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo|null
     */
    public function close()
    {
        if ($this->_docCount == 0) {
            return null;
        }

        $this->_dumpFNM();
        $this->_dumpDictionary();
        $this->_generateCFS();

        return new Zend_Search_Lucene_Index_SegmentInfo(
            $this->_directory,
            $this->_name,
            $this->_docCount,
            -1,
            null,
            true,
            true
        );
    }
}
Lucene/Index/SegmentWriter/StreamWriter.php 0000666 00000006000 15125712134 0014767 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Index_SegmentInfo */
require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
/** Zend_Search_Lucene_Index_SegmentWriter */
require_once 'Zend/Search/Lucene/Index/SegmentWriter.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_SegmentWriter_StreamWriter extends Zend_Search_Lucene_Index_SegmentWriter
{
    /**
     * Object constructor.
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param string $name
     */
    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
    {
        parent::__construct($directory, $name);
    }

    /**
     * Create stored fields files (.fdx and .fdt) and open them for write
     *
     * Both file names are also appended to the segment's file list.
     */
    public function createStoredFieldsFiles()
    {
        $fdxName = $this->_name . '.fdx';
        $fdtName = $this->_name . '.fdt';

        $this->_fdxFile = $this->_directory->createFile($fdxName);
        $this->_fdtFile = $this->_directory->createFile($fdtName);

        $this->_files[] = $fdxName;
        $this->_files[] = $fdtName;
    }

    /**
     * Add a norm vector for a field
     *
     * The vector is appended to the norms already collected for the field,
     * or stored as is when the field has no norms yet.
     *
     * @param string $fieldName
     * @param string $normVector
     */
    public function addNorm($fieldName, $normVector)
    {
        if (!isset($this->_norms[$fieldName])) {
            $this->_norms[$fieldName] = $normVector;
            return;
        }

        $this->_norms[$fieldName] .= $normVector;
    }

    /**
     * Close segment, write it to disk and return segment info
     *
     * Nothing is written and null is returned when the segment has no documents.
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo|null
     */
    public function close()
    {
        if ($this->_docCount == 0) {
            return null;
        }

        $this->_dumpFNM();
        $this->_generateCFS();

        return new Zend_Search_Lucene_Index_SegmentInfo(
            $this->_directory,
            $this->_name,
            $this->_docCount,
            -1,
            null,
            true,
            true
        );
    }
}
Lucene/Index/TermInfo.php 0000666 00000004122 15125712134 0011266 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* A Zend_Search_Lucene_Index_TermInfo represents a record of information stored for a term.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_TermInfo
{
    /**
     * The number of documents which contain the term.
     *
     * @var integer
     */
    public $docFreq;

    /**
     * Data offset in a Frequencies file.
     *
     * @var integer
     */
    public $freqPointer;

    /**
     * Data offset in a Positions file.
     *
     * @var integer
     */
    public $proxPointer;

    /**
     * SkipData offset in a Frequencies file.
     *
     * @var integer
     */
    public $skipOffset;

    /**
     * Term offset of the _next_ term in a TermDictionary file.
     * Used only for Term Index
     *
     * @var integer
     */
    public $indexPointer;

    /**
     * Object constructor; stores every argument in the property of the same name.
     *
     * @param integer $docFreq
     * @param integer $freqPointer
     * @param integer $proxPointer
     * @param integer $skipOffset
     * @param integer|null $indexPointer  Left as null when not given
     */
    public function __construct($docFreq, $freqPointer, $proxPointer, $skipOffset, $indexPointer = null)
    {
        $this->docFreq      = $docFreq;
        $this->freqPointer  = $freqPointer;
        $this->proxPointer  = $proxPointer;
        $this->skipOffset   = $skipOffset;
        $this->indexPointer = $indexPointer;
    }
}
Lucene/Storage/File/Filesystem.php 0000666 00000013560 15125712134 0013111 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Storage_File */
require_once 'Zend/Search/Lucene/Storage/File.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Storage_File_Filesystem extends Zend_Search_Lucene_Storage_File
{
    /**
     * Resource of the open file
     *
     * @var resource
     */
    protected $_fileHandle;

    /**
     * Class constructor.  Open the file.
     *
     * @param string $filename
     * @param string $mode
     * @throws Zend_Search_Lucene_Exception  if the file is not readable or can't be opened
     */
    public function __construct($filename, $mode='r+b')
    {
        if (strpos($mode, 'w') === false  &&  !is_readable($filename)) {
            // opening for reading non-readable file
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('File \'' . $filename . '\' is not readable.');
        }

        // $php_errormsg / track_errors are gone in PHP 8; error_get_last()
        // reports the same suppressed warning on every supported version.
        if (function_exists('error_clear_last')) {
            // Make sure a stale, unrelated error is never reported below
            error_clear_last();
        }

        $this->_fileHandle = @fopen($filename, $mode);

        if ($this->_fileHandle === false) {
            $lastError = error_get_last();
            $message   = ($lastError !== null && isset($lastError['message']))
                       ? $lastError['message']
                       : 'Can\'t open file \'' . $filename . '\'.';

            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception($message);
        }
    }

    /**
     * Sets the file position indicator and advances the file pointer.
     * The new position, measured in bytes from the beginning of the file,
     * is obtained by adding offset to the position specified by whence,
     * whose values are defined as follows:
     * SEEK_SET - Set position equal to offset bytes.
     * SEEK_CUR - Set position to current location plus offset.
     * SEEK_END - Set position to end-of-file plus offset. (To move to
     * a position before the end-of-file, you need to pass a negative value
     * in offset.)
     * SEEK_CUR is the only supported offset type for compound files
     *
     * Upon success, returns 0; otherwise, returns -1
     *
     * @param integer $offset
     * @param integer $whence
     * @return integer
     */
    public function seek($offset, $whence=SEEK_SET)
    {
        return fseek($this->_fileHandle, $offset, $whence);
    }

    /**
     * Get file position.
     *
     * @return integer
     */
    public function tell()
    {
        return ftell($this->_fileHandle);
    }

    /**
     * Flush output.
     *
     * Returns true on success or false on failure.
     *
     * @return boolean
     */
    public function flush()
    {
        return fflush($this->_fileHandle);
    }

    /**
     * Close File object
     *
     * Safe to call more than once: the handle is reset to null after the first call.
     */
    public function close()
    {
        if ($this->_fileHandle !== null ) {
            @fclose($this->_fileHandle);
            $this->_fileHandle = null;
        }
    }

    /**
     * Get the size of the already opened file
     *
     * The current file position is restored before returning.
     *
     * @return integer
     */
    public function size()
    {
        $position = ftell($this->_fileHandle);
        fseek($this->_fileHandle, 0, SEEK_END);
        $size = ftell($this->_fileHandle);
        fseek($this->_fileHandle,$position);

        return $size;
    }

    /**
     * Read a $length bytes from the file and advance the file pointer.
     *
     * Requests of 1024 bytes or more are read in a loop until $length bytes
     * are collected or nothing more can be read; in the latter case the data
     * read so far is returned.
     *
     * @param integer $length
     * @return string
     */
    protected function _fread($length=1)
    {
        if ($length == 0) {
            return '';
        }

        if ($length < 1024) {
            return fread($this->_fileHandle, $length);
        }

        $data = '';
        while ($length > 0) {
            $nextBlock = fread($this->_fileHandle, $length);

            // Strict checks only: a chunk consisting of the single character "0"
            // is valid data, but compares loosely equal to false.
            if ($nextBlock === false  ||  $nextBlock === '') {
                break;
            }

            $data   .= $nextBlock;
            $length -= strlen($nextBlock);
        }

        return $data;
    }

    /**
     * Writes $length number of bytes (all, if $length===null) at the
     * current position of the file handle.
     *
     * @param string $data
     * @param integer $length
     */
    protected function _fwrite($data, $length=null)
    {
        if ($length === null ) {
            fwrite($this->_fileHandle, $data);
        } else {
            fwrite($this->_fileHandle, $data, $length);
        }
    }

    /**
     * Lock file
     *
     * Lock type may be a LOCK_SH (shared lock) or a LOCK_EX (exclusive lock)
     *
     * @param integer $lockType
     * @param boolean $nonBlockingLock
     * @return boolean
     */
    public function lock($lockType, $nonBlockingLock = false)
    {
        if ($nonBlockingLock) {
            return flock($this->_fileHandle, $lockType | LOCK_NB);
        } else {
            return flock($this->_fileHandle, $lockType);
        }
    }

    /**
     * Unlock file
     *
     * Returns true on success, and also when the file is already closed
     *
     * @return boolean
     */
    public function unlock()
    {
        if ($this->_fileHandle !== null ) {
            return flock($this->_fileHandle, LOCK_UN);
        } else {
            return true;
        }
    }
}
Lucene/Storage/Directory/Filesystem.php 0000666 00000024475 15125712134 0014205 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Storage_Directory */
require_once 'Zend/Search/Lucene/Storage/Directory.php';
/** Zend_Search_Lucene_Storage_File_Filesystem */
require_once 'Zend/Search/Lucene/Storage/File/Filesystem.php';
/**
* FileSystem implementation of Directory abstraction.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Storage_Directory_Filesystem extends Zend_Search_Lucene_Storage_Directory
{
    /**
     * Filesystem path to the directory
     *
     * @var string
     */
    protected $_dirPath = null;

    /**
     * Cache for Zend_Search_Lucene_Storage_File_Filesystem objects
     * Array: filename => Zend_Search_Lucene_Storage_File object
     *
     * @var array
     */
    protected $_fileHandlers;

    /**
     * Default file permissions
     *
     * @var integer
     */
    protected static $_defaultFilePermissions = 0666;

    /**
     * Get default file permissions
     *
     * @return integer
     */
    public static function getDefaultFilePermissions()
    {
        return self::$_defaultFilePermissions;
    }

    /**
     * Set default file permissions
     *
     * @param integer $mode
     */
    public static function setDefaultFilePermissions($mode)
    {
        self::$_defaultFilePermissions = $mode;
    }

    /**
     * Utility function to recursive directory creation
     *
     * Missing parent directories are always created; the $recursive
     * argument is only passed along and does not switch this off.
     *
     * @param string $dir
     * @param integer $mode
     * @param boolean $recursive
     * @return boolean
     */
    public static function mkdirs($dir, $mode = 0777, $recursive = true)
    {
        if (is_null($dir) || $dir === '') {
            return false;
        }
        if (is_dir($dir) || $dir === '/') {
            return true;
        }

        $parentDir = dirname($dir);
        if ($parentDir === $dir) {
            // A root that does not exist (e.g. an unknown drive) is its own
            // parent; stop here instead of recursing forever.
            return false;
        }

        if (self::mkdirs($parentDir, $mode, $recursive)) {
            return mkdir($dir, $mode);
        }
        return false;
    }

    /**
     * Object constructor
     * Checks if $path is a directory or tries to create it.
     *
     * @param string $path
     * @throws Zend_Search_Lucene_Exception
     */
    public function __construct($path)
    {
        if (!is_dir($path)) {
            if (file_exists($path)) {
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Path exists, but it\'s not a directory');
            } else {
                if (!self::mkdirs($path)) {
                    require_once 'Zend/Search/Lucene/Exception.php';
                    throw new Zend_Search_Lucene_Exception("Can't create directory '$path'.");
                }
            }
        }
        $this->_dirPath = $path;
        $this->_fileHandlers = array();
    }

    /**
     * Closes the store.
     *
     * Every cached file object is closed and the cache is emptied.
     *
     * @return void
     */
    public function close()
    {
        foreach ($this->_fileHandlers as $fileObject) {
            $fileObject->close();
        }

        $this->_fileHandlers = array();
    }

    /**
     * Returns an array of strings, one for each file in the directory.
     *
     * Subdirectories are not listed. If the directory can't be opened
     * (opendir() reports that with a warning), an empty array is returned.
     *
     * @return array
     */
    public function fileList()
    {
        $result = array();

        $dirContent = opendir( $this->_dirPath );
        if ($dirContent === false) {
            // Don't pass a non-resource to readdir()/closedir()
            return $result;
        }

        while (($file = readdir($dirContent)) !== false) {
            if (($file == '..')||($file == '.'))   continue;

            if( !is_dir($this->_dirPath . '/' . $file) ) {
                $result[] = $file;
            }
        }
        closedir($dirContent);

        return $result;
    }

    /**
     * Creates a new, empty file in the directory with the given $filename.
     *
     * A cached file object for the same name is closed and replaced.
     *
     * @param string $filename
     * @return Zend_Search_Lucene_Storage_File
     * @throws Zend_Search_Lucene_Exception
     */
    public function createFile($filename)
    {
        if (isset($this->_fileHandlers[$filename])) {
            $this->_fileHandlers[$filename]->close();
        }
        unset($this->_fileHandlers[$filename]);
        $this->_fileHandlers[$filename] = new Zend_Search_Lucene_Storage_File_Filesystem($this->_dirPath . '/' . $filename, 'w+b');

        // Set file permissions, but don't care about any possible failures, since file may be already
        // created by another user which has to care about right permissions
        @chmod($this->_dirPath . '/' . $filename, self::$_defaultFilePermissions);

        return $this->_fileHandlers[$filename];
    }

    /**
     * Removes an existing $filename in the directory.
     *
     * @param string $filename
     * @return void
     * @throws Zend_Search_Lucene_Exception
     */
    public function deleteFile($filename)
    {
        if (isset($this->_fileHandlers[$filename])) {
            $this->_fileHandlers[$filename]->close();
        }
        unset($this->_fileHandlers[$filename]);

        // $php_errormsg / track_errors are gone in PHP 8; error_get_last()
        // reports the same suppressed warning on every supported version.
        if (function_exists('error_clear_last')) {
            // Make sure a stale, unrelated error is never reported below
            error_clear_last();
        }

        if (!@unlink($this->_dirPath . '/' . $filename)) {
            $lastError = error_get_last();
            $reason    = ($lastError !== null && isset($lastError['message']))
                       ? $lastError['message']
                       : 'unknown error';

            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Can\'t delete file: ' . $reason);
        }
    }

    /**
     * Purge file if it's cached by directory object
     *
     * Method is used to prevent 'too many open files' error
     *
     * @param string $filename
     * @return void
     */
    public function purgeFile($filename)
    {
        if (isset($this->_fileHandlers[$filename])) {
            $this->_fileHandlers[$filename]->close();
        }
        unset($this->_fileHandlers[$filename]);
    }

    /**
     * Returns true if a file with the given $filename exists.
     *
     * @param string $filename
     * @return boolean
     */
    public function fileExists($filename)
    {
        return isset($this->_fileHandlers[$filename]) ||
               file_exists($this->_dirPath . '/' . $filename);
    }

    /**
     * Returns the length of a $filename in the directory.
     *
     * For a cached file the size is taken from the open file object.
     *
     * @param string $filename
     * @return integer
     */
    public function fileLength($filename)
    {
        if (isset( $this->_fileHandlers[$filename] )) {
            return $this->_fileHandlers[$filename]->size();
        }
        return filesize($this->_dirPath .'/'. $filename);
    }

    /**
     * Returns the UNIX timestamp $filename was last modified.
     *
     * @param string $filename
     * @return integer
     */
    public function fileModified($filename)
    {
        return filemtime($this->_dirPath .'/'. $filename);
    }

    /**
     * Renames an existing file in the directory.
     *
     * Cached file objects for both names are closed first, and an existing
     * target file is deleted before the rename.
     *
     * @param string $from
     * @param string $to
     * @return boolean  true on success (failures are reported by exception)
     * @throws Zend_Search_Lucene_Exception
     */
    public function renameFile($from, $to)
    {
        if (isset($this->_fileHandlers[$from])) {
            $this->_fileHandlers[$from]->close();
        }
        unset($this->_fileHandlers[$from]);

        if (isset($this->_fileHandlers[$to])) {
            $this->_fileHandlers[$to]->close();
        }
        unset($this->_fileHandlers[$to]);

        if (file_exists($this->_dirPath . '/' . $to)) {
            if (!unlink($this->_dirPath . '/' . $to)) {
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Delete operation failed');
            }
        }

        // $php_errormsg / track_errors are gone in PHP 8; error_get_last()
        // reports the same suppressed warning on every supported version.
        if (function_exists('error_clear_last')) {
            // Make sure a stale, unrelated error is never reported below
            error_clear_last();
        }

        $success = @rename($this->_dirPath . '/' . $from, $this->_dirPath . '/' . $to);
        if (!$success) {
            $lastError = error_get_last();
            $message   = ($lastError !== null && isset($lastError['message']))
                       ? $lastError['message']
                       : 'Can\'t rename file \'' . $from . '\' to \'' . $to . '\'.';

            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception($message);
        }

        return $success;
    }

    /**
     * Sets the modified time of $filename to now.
     *
     * @param string $filename
     * @return boolean  result of touch()
     */
    public function touchFile($filename)
    {
        return touch($this->_dirPath .'/'. $filename);
    }

    /**
     * Returns a Zend_Search_Lucene_Storage_File object for a given $filename in the directory.
     *
     * If $shareHandler option is true, then the file handler can be shared between File Object
     * requests. It speeds up performance, but causes problems with file position:
     * a shared handler is rewound to the start of the file each time it is requested.
     * Shared handlers are good for short atomic requests.
     * Non-shared handlers are useful for stream file reading (especially for compound files).
     *
     * @param string $filename
     * @param boolean $shareHandler
     * @return Zend_Search_Lucene_Storage_File
     */
    public function getFileObject($filename, $shareHandler = true)
    {
        $fullFilename = $this->_dirPath . '/' . $filename;

        if (!$shareHandler) {
            return new Zend_Search_Lucene_Storage_File_Filesystem($fullFilename);
        }

        if (isset( $this->_fileHandlers[$filename] )) {
            $this->_fileHandlers[$filename]->seek(0);
            return $this->_fileHandlers[$filename];
        }

        $this->_fileHandlers[$filename] = new Zend_Search_Lucene_Storage_File_Filesystem($fullFilename);
        return $this->_fileHandlers[$filename];
    }
}
Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php 0000666 00000003004 15125712134 0017772 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php';
/** Zend_Search_Lucene_Analysis_TokenFilter_LowerCase */
require_once 'Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive extends Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum
{
    /**
     * Object constructor
     *
     * Registers a LowerCase token filter on top of the TextNum analyzer.
     */
    public function __construct()
    {
        $lowerCaseFilter = new Zend_Search_Lucene_Analysis_TokenFilter_LowerCase();

        $this->addFilter($lowerCaseFilter);
    }
}
Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php 0000666 00000002770 15125712134 0017323 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
/** Zend_Search_Lucene_Analysis_TokenFilter_LowerCase */
require_once 'Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive extends Zend_Search_Lucene_Analysis_Analyzer_Common_Text
{
    /**
     * Object constructor
     *
     * Registers a lower-casing token filter, so every token produced by the
     * parent analyzer is passed through it.
     */
    public function __construct()
    {
        $lowerCaseFilter = new Zend_Search_Lucene_Analysis_TokenFilter_LowerCase();
        $this->addFilter($lowerCaseFilter);
    }
}
Lucene/Analysis/Analyzer/Common/Text.php 0000666 00000005245 15125712134 0014227 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_Analyzer_Common */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_Analyzer_Common_Text extends Zend_Search_Lucene_Analysis_Analyzer_Common
{
    /**
     * Offset in the input string at which the search for the next token starts
     *
     * @var integer
     */
    private $_position;

    /**
     * Reset token stream
     *
     * Rewinds the scan position and, when input is present, transliterates it to ASCII.
     */
    public function reset()
    {
        $this->_position = 0;

        if ($this->_input !== null) {
            // The conversion itself is skipped on AIX (reason not visible in this file),
            // but the encoding is marked as ASCII either way.
            if (PHP_OS != 'AIX') {
                $this->_input = iconv($this->_encoding, 'ASCII//TRANSLIT', $this->_input);
            }
            $this->_encoding = 'ASCII';
        }
    }

    /**
     * Tokenization stream API
     * Get next token
     * Returns null at the end of stream
     *
     * A token is a run of ASCII letters. Tokens removed by the filter chain are
     * skipped and scanning continues with the next run.
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }

        while (true) {
            $found = preg_match('/[a-zA-Z]+/', $this->_input, $hit, PREG_OFFSET_CAPTURE, $this->_position);
            if (!$found) {
                // Either nothing is left to match (0) or the match failed (false).
                return null;
            }

            list($word, $start) = $hit[0];
            $end = $start + strlen($word);

            // Advance before filtering so a dropped token is not matched again.
            $this->_position = $end;

            $candidate = $this->normalize(new Zend_Search_Lucene_Analysis_Token($word, $start, $end));
            if ($candidate !== null) {
                return $candidate;
            }
        }
    }
}
Lucene/Analysis/Analyzer/Common/Utf8Num.php 0000666 00000007774 15125712134 0014622 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_Analyzer_Common */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num extends Zend_Search_Lucene_Analysis_Analyzer_Common
{
    /**
     * Current char position in an UTF-8 stream
     *
     * @var integer
     */
    private $_position;

    /**
     * Current binary position in an UTF-8 stream
     *
     * @var integer
     */
    private $_bytePosition;

    /**
     * Object constructor
     *
     * Probes PCRE with a Unicode-property pattern and refuses to construct the
     * analyzer when that probe does not match.
     *
     * @throws Zend_Search_Lucene_Exception
     */
    public function __construct()
    {
        if (@preg_match('/\pL/u', 'a') != 1) {
            // PCRE unicode support is turned off
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Utf8Num analyzer needs PCRE unicode support to be enabled.');
        }
    }

    /**
     * Reset token stream
     *
     * Rewinds both the character and the byte position and, when input is
     * present and not already declared as UTF-8, converts it to UTF-8.
     */
    public function reset()
    {
        $this->_position     = 0;
        $this->_bytePosition = 0;

        // No input set: keep the null sentinel that nextToken() checks for
        // instead of feeding null to iconv().
        if ($this->_input === null) {
            return;
        }

        // convert input into UTF-8
        if (strcasecmp($this->_encoding, 'utf8' ) != 0  &&
            strcasecmp($this->_encoding, 'utf-8') != 0 ) {
            $this->_input    = iconv($this->_encoding, 'UTF-8', $this->_input);
            $this->_encoding = 'UTF-8';
        }
    }

    /**
     * Tokenization stream API
     * Get next token
     * Returns null at the end of stream
     *
     * A token is a run of Unicode letters and numbers. Token offsets are
     * expressed in characters, while the scan itself advances in bytes.
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }

        do {
            if (! preg_match('/[\p{L}\p{N}]+/u', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_bytePosition)) {
                // It covers both cases a) there are no matches (preg_match(...) === 0)
                // b) error occurred (preg_match(...) === FALSE)
                return null;
            }

            // matched string
            $matchedWord = $match[0][0];

            // binary position of the matched word in the input stream
            $binStartPos = $match[0][1];

            // character position of the matched word in the input stream:
            // previous character position plus the characters skipped before the match
            $startPos = $this->_position +
                        iconv_strlen(substr($this->_input,
                                            $this->_bytePosition,
                                            $binStartPos - $this->_bytePosition),
                                     'UTF-8');

            // character position of the end of matched word in the input stream
            $endPos = $startPos + iconv_strlen($matchedWord, 'UTF-8');

            $this->_bytePosition = $binStartPos + strlen($matchedWord);
            $this->_position     = $endPos;

            $token = $this->normalize(new Zend_Search_Lucene_Analysis_Token($matchedWord, $startPos, $endPos));
        } while ($token === null); // try again if token is skipped

        return $token;
    }
}
Lucene/Analysis/Analyzer/Common/Utf8/CaseInsensitive.php 0000666 00000003047 15125712134 0017223 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php';
/** Zend_Search_Lucene_Analysis_TokenFilter_LowerCaseUtf8 */
require_once 'Zend/Search/Lucene/Analysis/TokenFilter/LowerCaseUtf8.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive extends Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8
{
    /**
     * Object constructor
     *
     * Runs the parent constructor first, then registers a UTF-8 aware
     * lower-casing token filter.
     */
    public function __construct()
    {
        parent::__construct();

        $lowerCaseFilter = new Zend_Search_Lucene_Analysis_TokenFilter_LowerCaseUtf8();
        $this->addFilter($lowerCaseFilter);
    }
}
Lucene/Analysis/Analyzer/Common/TextNum.php 0000666 00000005251 15125712134 0014704 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_Analyzer_Common */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum extends Zend_Search_Lucene_Analysis_Analyzer_Common
{
    /**
     * Offset in the input string at which the search for the next token starts
     *
     * @var integer
     */
    private $_position;

    /**
     * Reset token stream
     *
     * Rewinds the scan position and, when input is present, transliterates it to ASCII.
     */
    public function reset()
    {
        $this->_position = 0;

        if ($this->_input !== null) {
            // The conversion itself is skipped on AIX (reason not visible in this file),
            // but the encoding is marked as ASCII either way.
            if (PHP_OS != 'AIX') {
                $this->_input = iconv($this->_encoding, 'ASCII//TRANSLIT', $this->_input);
            }
            $this->_encoding = 'ASCII';
        }
    }

    /**
     * Tokenization stream API
     * Get next token
     * Returns null at the end of stream
     *
     * A token is a run of ASCII letters and digits. Tokens removed by the filter
     * chain are skipped and scanning continues with the next run.
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }

        while (true) {
            $found = preg_match('/[a-zA-Z0-9]+/', $this->_input, $hit, PREG_OFFSET_CAPTURE, $this->_position);
            if (!$found) {
                // Either nothing is left to match (0) or the match failed (false).
                return null;
            }

            list($word, $start) = $hit[0];
            $end = $start + strlen($word);

            // Advance before filtering so a dropped token is not matched again.
            $this->_position = $end;

            $candidate = $this->normalize(new Zend_Search_Lucene_Analysis_Token($word, $start, $end));
            if ($candidate !== null) {
                return $candidate;
            }
        }
    }
}
Lucene/Analysis/Analyzer/Common/Utf8.php 0000666 00000007765 15125712134 0014142 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_Analyzer_Common */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 extends Zend_Search_Lucene_Analysis_Analyzer_Common
{
    /**
     * Current char position in an UTF-8 stream
     *
     * @var integer
     */
    private $_position;

    /**
     * Current binary position in an UTF-8 stream
     *
     * @var integer
     */
    private $_bytePosition;

    /**
     * Object constructor
     *
     * Probes PCRE with a Unicode-property pattern and refuses to construct the
     * analyzer when that probe does not match.
     *
     * @throws Zend_Search_Lucene_Exception
     */
    public function __construct()
    {
        if (@preg_match('/\pL/u', 'a') != 1) {
            // PCRE unicode support is turned off
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Utf8 analyzer needs PCRE unicode support to be enabled.');
        }
    }

    /**
     * Reset token stream
     *
     * Rewinds both the character and the byte position and, when input is
     * present and not already declared as UTF-8, converts it to UTF-8.
     */
    public function reset()
    {
        $this->_position     = 0;
        $this->_bytePosition = 0;

        // No input set: keep the null sentinel that nextToken() checks for
        // instead of feeding null to iconv().
        if ($this->_input === null) {
            return;
        }

        // convert input into UTF-8
        if (strcasecmp($this->_encoding, 'utf8' ) != 0  &&
            strcasecmp($this->_encoding, 'utf-8') != 0 ) {
            $this->_input    = iconv($this->_encoding, 'UTF-8', $this->_input);
            $this->_encoding = 'UTF-8';
        }
    }

    /**
     * Tokenization stream API
     * Get next token
     * Returns null at the end of stream
     *
     * A token is a run of Unicode letters. Token offsets are expressed in
     * characters, while the scan itself advances in bytes.
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }

        do {
            if (! preg_match('/[\p{L}]+/u', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_bytePosition)) {
                // It covers both cases a) there are no matches (preg_match(...) === 0)
                // b) error occurred (preg_match(...) === FALSE)
                return null;
            }

            // matched string
            $matchedWord = $match[0][0];

            // binary position of the matched word in the input stream
            $binStartPos = $match[0][1];

            // character position of the matched word in the input stream:
            // previous character position plus the characters skipped before the match
            $startPos = $this->_position +
                        iconv_strlen(substr($this->_input,
                                            $this->_bytePosition,
                                            $binStartPos - $this->_bytePosition),
                                     'UTF-8');

            // character position of the end of matched word in the input stream
            $endPos = $startPos + iconv_strlen($matchedWord, 'UTF-8');

            $this->_bytePosition = $binStartPos + strlen($matchedWord);
            $this->_position     = $endPos;

            $token = $this->normalize(new Zend_Search_Lucene_Analysis_Token($matchedWord, $startPos, $endPos));
        } while ($token === null); // try again if token is skipped

        return $token;
    }
}
Lucene/Analysis/Analyzer/Common/Utf8Num/CaseInsensitive.php 0000666 00000003062 15125712134 0017700 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php';
/** Zend_Search_Lucene_Analysis_TokenFilter_LowerCaseUtf8 */
require_once 'Zend/Search/Lucene/Analysis/TokenFilter/LowerCaseUtf8.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive extends Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num
{
    /**
     * Object constructor
     *
     * Runs the parent constructor first, then registers a UTF-8 aware
     * lower-casing token filter.
     */
    public function __construct()
    {
        parent::__construct();

        $lowerCaseFilter = new Zend_Search_Lucene_Analysis_TokenFilter_LowerCaseUtf8();
        $this->addFilter($lowerCaseFilter);
    }
}
Lucene/Analysis/Analyzer/Common.php 0000666 00000005102 15125712134 0013273 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_Analyzer */
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
/**
* Common implementation of the Zend_Search_Lucene_Analysis_Analyzer interface.
* There are several standard subclasses provided by the Zend_Search_Lucene/Analysis
* subpackage: Zend_Search_Lucene_Analysis_Analyzer_Common_Text, ZSearchHTMLAnalyzer, ZSearchXMLAnalyzer.
*
* @todo ZSearchHTMLAnalyzer and ZSearchXMLAnalyzer implementation
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Analysis_Analyzer_Common extends Zend_Search_Lucene_Analysis_Analyzer
{
    /**
     * Token filters, in the order they were added; normalize() applies them in this order.
     * Array of Zend_Search_Lucene_Analysis_TokenFilter objects.
     *
     * @var array
     */
    private $_filters = array();

    /**
     * Append a Token filter to the end of the filter chain
     *
     * @param Zend_Search_Lucene_Analysis_TokenFilter $filter
     */
    public function addFilter(Zend_Search_Lucene_Analysis_TokenFilter $filter)
    {
        $this->_filters[] = $filter;
    }

    /**
     * Run the token through every registered filter in turn.
     * Returns null as soon as one filter removes the token.
     *
     * @param Zend_Search_Lucene_Analysis_Token $token
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function normalize(Zend_Search_Lucene_Analysis_Token $token)
    {
        foreach ($this->_filters as $tokenFilter) {
            $token = $tokenFilter->normalize($token);

            if ($token === null) {
                // A filter dropped the token; the remaining filters are not consulted.
                break;
            }
        }

        return $token;
    }
}
Lucene/Analysis/Analyzer.php 0000666 00000012351 15125712134 0012047 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_Token */
require_once 'Zend/Search/Lucene/Analysis/Token.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8/CaseInsensitive.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num/CaseInsensitive.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php';
/** Zend_Search_Lucene_Analysis_TokenFilter_StopWords */
require_once 'Zend/Search/Lucene/Analysis/TokenFilter/StopWords.php';
/** Zend_Search_Lucene_Analysis_TokenFilter_ShortWords */
require_once 'Zend/Search/Lucene/Analysis/TokenFilter/ShortWords.php';
/**
* An Analyzer is used to analyze text.
* It thus represents a policy for extracting index terms from text.
*
* Note:
* The Lucene Java implementation is stream-oriented, which lets it work efficiently
* with huge documents (more than 20Mb).
* This engine itself, however, is not oriented towards such documents.
* Thus the Zend_Search_Lucene analysis API works with data strings and sets (arrays).
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Analysis_Analyzer
{
    /**
     * The Analyzer implementation used by default.
     * Stays unset until setDefault() or getDefault() is called.
     *
     * @var Zend_Search_Lucene_Analysis_Analyzer
     */
    private static $_defaultImpl;

    /**
     * Input string (null while no input has been set)
     *
     * @var string
     */
    protected $_input = null;

    /**
     * Input string encoding
     *
     * @var string
     */
    protected $_encoding = '';

    /**
     * Tokenize text into terms
     * Returns array of Zend_Search_Lucene_Analysis_Token objects
     *
     * Sets the given data as input and then drains nextToken() until it reports
     * the end of the stream.
     *
     * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
     *
     * @param string $data
     * @param string $encoding
     * @return array
     */
    public function tokenize($data, $encoding = '')
    {
        $this->setInput($data, $encoding);

        $collected = array();
        for ($token = $this->nextToken(); $token !== null; $token = $this->nextToken()) {
            $collected[] = $token;
        }

        return $collected;
    }

    /**
     * Tokenization stream API
     * Set input
     *
     * Stores the data and its encoding, then resets the token stream.
     *
     * @param string $data
     * @param string $encoding
     */
    public function setInput($data, $encoding = '')
    {
        $this->_encoding = $encoding;
        $this->_input    = $data;

        $this->reset();
    }

    /**
     * Reset token stream
     */
    abstract public function reset();

    /**
     * Tokenization stream API
     * Get next token
     * Returns null at the end of stream
     *
     * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    abstract public function nextToken();

    /**
     * Set the default Analyzer implementation used by indexing code.
     *
     * @param Zend_Search_Lucene_Analysis_Analyzer $analyzer
     */
    public static function setDefault(Zend_Search_Lucene_Analysis_Analyzer $analyzer)
    {
        self::$_defaultImpl = $analyzer;
    }

    /**
     * Return the default Analyzer implementation used by indexing code.
     *
     * When none has been set yet, a case-insensitive Text analyzer is created,
     * stored as the default and returned.
     *
     * @return Zend_Search_Lucene_Analysis_Analyzer
     */
    public static function getDefault()
    {
        if (self::$_defaultImpl instanceof Zend_Search_Lucene_Analysis_Analyzer) {
            return self::$_defaultImpl;
        }

        self::$_defaultImpl = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();

        return self::$_defaultImpl;
    }
}
Lucene/Analysis/TokenFilter/LowerCase.php 0000666 00000003574 15125712134 0014403 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_TokenFilter */
require_once 'Zend/Search/Lucene/Analysis/TokenFilter.php';
/**
* Lower case Token filter.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_TokenFilter_LowerCase extends Zend_Search_Lucene_Analysis_TokenFilter
{
    /**
     * Build a lower-cased copy of the token.
     *
     * The copy keeps the source token's offsets and position increment;
     * this filter never removes a token.
     *
     * @param Zend_Search_Lucene_Analysis_Token $srcToken
     * @return Zend_Search_Lucene_Analysis_Token
     */
    public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
    {
        $loweredText = strtolower($srcToken->getTermText());

        $result = new Zend_Search_Lucene_Analysis_Token(
            $loweredText,
            $srcToken->getStartOffset(),
            $srcToken->getEndOffset()
        );
        $result->setPositionIncrement($srcToken->getPositionIncrement());

        return $result;
    }
}
Lucene/Analysis/TokenFilter/ShortWords.php 0000666 00000004104 15125712134 0014623 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_TokenFilter */
require_once 'Zend/Search/Lucene/Analysis/TokenFilter.php';
/**
* Token filter that removes short words. The minimum word length is configured through the constructor.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_TokenFilter_ShortWords extends Zend_Search_Lucene_Analysis_TokenFilter
{
    /**
     * Minimum allowed term length
     * @var integer
     */
    private $length;

    /**
     * Constructs new instance of this filter.
     *
     * @param integer $length minimum allowed length of term which passes this filter (default 2)
     */
    public function __construct($length = 2) {
        $this->length = $length;
    }

    /**
     * Normalize Token or remove it (if null is returned)
     *
     * The term length is measured in UTF-8 characters, so a single multibyte
     * character is not mistaken for a longer word. When the character count
     * cannot be determined, the byte length is used instead.
     *
     * @param Zend_Search_Lucene_Analysis_Token $srcToken
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken) {
        $termText = $srcToken->getTermText();

        // iconv_strlen() raises a notice and returns false on malformed UTF-8;
        // the notice is suppressed because the byte-length fallback below handles that case.
        $termLength = function_exists('iconv_strlen') ? @iconv_strlen($termText, 'UTF-8') : false;
        if ($termLength === false) {
            $termLength = strlen($termText);
        }

        if ($termLength < $this->length) {
            return null;
        }

        return $srcToken;
    }
}
Lucene/Analysis/TokenFilter/StopWords.php 0000666 00000007031 15125712134 0014453 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_TokenFilter */
require_once 'Zend/Search/Lucene/Analysis/TokenFilter.php';
/**
* Token filter that removes stop words. These words must be provided as a plain array (list) of words,
* because the constructor flips the array to build its lookup set. Example:
* $stopwords = array('the', 'an');
*
* We do recommend to provide all words in lowercase and concatenate this class after the lowercase filter.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_TokenFilter_StopWords extends Zend_Search_Lucene_Analysis_TokenFilter
{
    /**
     * Stop Words
     *
     * The words are stored as array keys, so a membership test is a key lookup.
     *
     * @var array
     */
    private $_stopSet;

    /**
     * Constructs new instance of this filter.
     *
     * @param array $stopwords list of words (as array values) that will be filtered out;
     *                         the list is flipped so the words become the keys of the stop set
     */
    public function __construct($stopwords = array()) {
        $this->_stopSet = array_flip($stopwords);
    }

    /**
     * Normalize Token or remove it (if null is returned)
     *
     * The token is removed when its term text is a key of the stop set.
     * The comparison is exact; no case folding happens here.
     *
     * @param Zend_Search_Lucene_Analysis_Token $srcToken
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken) {
        if (array_key_exists($srcToken->getTermText(), $this->_stopSet)) {
            return null;
        } else {
            return $srcToken;
        }
    }

    /**
     * Fills stopwords set from a text file. Each line contains one stopword, lines with '#' in the first
     * column are ignored (as comments). Lines are trimmed and blank lines are skipped.
     *
     * You can call this method one or more times. New stopwords are always added to current set.
     *
     * @param string $filepath full path for text file with stopwords
     * @throws Zend_Search_Lucene_Exception When the file doesn't exist, cannot be opened or cannot be closed.
     */
    public function loadFromFile($filepath = null) {
        if (! $filepath || ! file_exists($filepath)) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('You have to provide valid file path');
        }
        $fd = fopen($filepath, "r");
        if (! $fd) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Cannot open file ' . $filepath);
        }
        // Loop on the result of fgets() rather than on feof(): fgets() returns false
        // both at end of file and on a read error, so the loop always terminates.
        while (($line = fgets($fd)) !== false) {
            $buffer = trim($line);
            if (strlen($buffer) > 0 && $buffer[0] != '#') {
                $this->_stopSet[$buffer] = 1;
            }
        }
        if (!fclose($fd)) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Cannot close file ' . $filepath);
        }
    }
}
Lucene/Analysis/TokenFilter/LowerCaseUtf8.php 0000666 00000004432 15125712134 0015144 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_TokenFilter */
require_once 'Zend/Search/Lucene/Analysis/TokenFilter.php';
/**
* Lower case Token filter.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_TokenFilter_LowerCaseUtf8 extends Zend_Search_Lucene_Analysis_TokenFilter
{
    /**
     * Object constructor
     *
     * Refuses to construct the filter when mb_strtolower() is not available.
     *
     * @throws Zend_Search_Lucene_Exception
     */
    public function __construct()
    {
        if (function_exists('mb_strtolower')) {
            return;
        }

        // mbstring extension is disabled
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Utf8 compatible lower case filter needs mbstring extension to be enabled.');
    }

    /**
     * Build a lower-cased copy of the token, treating its text as UTF-8.
     *
     * The copy keeps the source token's offsets and position increment;
     * this filter never removes a token.
     *
     * @param Zend_Search_Lucene_Analysis_Token $srcToken
     * @return Zend_Search_Lucene_Analysis_Token
     */
    public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
    {
        $loweredText = mb_strtolower($srcToken->getTermText(), 'UTF-8');

        $result = new Zend_Search_Lucene_Analysis_Token(
            $loweredText,
            $srcToken->getStartOffset(),
            $srcToken->getEndOffset()
        );
        $result->setPositionIncrement($srcToken->getPositionIncrement());

        return $result;
    }
}
Lucene/Analysis/Token.php 0000666 00000010353 15125712134 0011342 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_Token
{
    /**
     * Term text carried by this token.
     *
     * @var string
     */
    private $_termText;

    /**
     * Offset in the source text where the token starts.
     *
     * @var integer
     */
    private $_startOffset;

    /**
     * Offset in the source text where the token ends.
     *
     * @var integer
     */
    private $_endOffset;

    /**
     * Position of this token relative to the previous token.
     *
     * Defaults to one (set by the constructor).
     *
     * Typical uses:
     *  - Zero places several terms at the same position, e.g. when a word
     *    has multiple stems: every stem after the first gets an increment of
     *    zero, so phrase searches match on either stem. Repeating a token
     *    with a zero increment can also boost matches on that token.
     *  - Values above one inhibit exact phrase matches, e.g. a stop word
     *    filter may set the increment to the number of stop words removed
     *    before a term so that phrases do not match across removed words.
     *
     * @var integer
     */
    private $_positionIncrement;

    /**
     * Object constructor
     *
     * The position increment is initialised to one.
     *
     * @param string  $text   Term text
     * @param integer $start  Start offset in the source text
     * @param integer $end    End offset in the source text
     */
    public function __construct($text, $start, $end)
    {
        $this->_positionIncrement = 1;
        $this->_endOffset         = $end;
        $this->_startOffset       = $start;
        $this->_termText          = $text;
    }

    /**
     * Returns the Token's term text.
     *
     * @return string
     */
    public function getTermText()
    {
        return $this->_termText;
    }

    /**
     * Returns the offset of the first character of this token in the
     * source text.
     *
     * Note: getEndOffset() - getStartOffset() is not necessarily equal to
     * strlen(getTermText()), since a stemmer or another filter may have
     * altered the term text.
     *
     * @return integer
     */
    public function getStartOffset()
    {
        return $this->_startOffset;
    }

    /**
     * Returns the end offset of this token: one greater than the position
     * of its last character in the source text.
     *
     * @return integer
     */
    public function getEndOffset()
    {
        return $this->_endOffset;
    }

    /**
     * Returns the position increment of this Token.
     *
     * @return integer
     */
    public function getPositionIncrement()
    {
        return $this->_positionIncrement;
    }

    /**
     * Sets the position increment of this Token.
     *
     * @param integer $positionIncrement
     */
    public function setPositionIncrement($positionIncrement)
    {
        $this->_positionIncrement = $positionIncrement;
    }
}
Lucene/Analysis/TokenFilter.php 0000666 00000002764 15125712134 0012517 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_Token */
require_once 'Zend/Search/Lucene/Analysis/Token.php';
/**
* Token filter converts (normalizes) a Token or removes it from a token stream.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Analysis_TokenFilter
{
/**
* Normalize a Token, or remove it from the token stream.
*
* Concrete filters return the token that should replace $srcToken.
* Per this contract, returning null means the token is removed.
*
* @param Zend_Search_Lucene_Analysis_Token $srcToken Token to process
* @return Zend_Search_Lucene_Analysis_Token|null Resulting token, or null to remove it
*/
abstract public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken);
}
Lucene/Document/Xlsx.php 0000666 00000022153 15125712134 0011214 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Document_OpenXml */
require_once 'Zend/Search/Lucene/Document/OpenXml.php';
if (class_exists('ZipArchive', false)) {
/**
 * Xlsx document.
 *
 * Builds a Lucene document from an Office Open XML spreadsheet: the cell
 * values of every worksheet become the 'body' field, and the package core
 * properties become additional fields.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Document
 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Document_Xlsx extends Zend_Search_Lucene_Document_OpenXml
{
    /**
     * Xml Schema - SpreadsheetML
     *
     * @var string
     */
    const SCHEMA_SPREADSHEETML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';

    /**
     * Xml Schema - DrawingML
     *
     * @var string
     */
    const SCHEMA_DRAWINGML = 'http://schemas.openxmlformats.org/drawingml/2006/main';

    /**
     * Xml Schema - Shared Strings
     *
     * @var string
     */
    const SCHEMA_SHAREDSTRINGS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings';

    /**
     * Xml Schema - Worksheet relation
     *
     * @var string
     */
    const SCHEMA_WORKSHEETRELATION = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet';

    /**
     * Xml Schema - Slide notes relation
     *
     * Not used by this class; kept so existing references to the constant
     * keep working.
     *
     * @var string
     */
    const SCHEMA_SLIDENOTESRELATION = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide';

    /**
     * Object constructor
     *
     * Reads the workbook relations, the shared string table and all
     * worksheets from the package, then adds the 'filename', 'body', core
     * property and (if missing) 'title' fields.
     *
     * @param string  $fileName
     * @param boolean $storeContent  Store the body text (Text field) or only index it (UnStored field)
     * @throws Zend_Search_Lucene_Document_Exception  If the file is not a usable .xlsx package
     */
    private function __construct($fileName, $storeContent)
    {
        // Document data holders
        $sharedStrings = array();
        $worksheets    = array();
        $documentBody  = array();

        // Open OpenXML package; open() returns true or an integer error code
        $package = new ZipArchive();
        if ($package->open($fileName) !== true) {
            require_once 'Zend/Search/Lucene/Document/Exception.php';
            throw new Zend_Search_Lucene_Document_Exception('Provided file \'' . $fileName . '\' is not a readable zip archive.');
        }

        // Read relations and search for officeDocument
        $relations = $this->_loadXmlPart($package, '_rels/.rels');
        if ($relations === false) {
            $package->close();
            require_once 'Zend/Search/Lucene/Document/Exception.php';
            throw new Zend_Search_Lucene_Document_Exception('Invalid archive or corrupted .xlsx file.');
        }

        foreach ($relations->Relationship as $rel) {
            if ((string) $rel['Type'] !== Zend_Search_Lucene_Document_OpenXml::SCHEMA_OFFICEDOCUMENT) {
                continue;
            }

            // Found office document! Read relations for workbook...
            $workbookTarget = (string) $rel['Target'];
            $workbookDir    = dirname($workbookTarget);

            $workbookRelations = $this->_loadXmlPart(
                $package,
                $workbookDir . '/_rels/' . basename($workbookTarget) . '.rels'
            );
            if ($workbookRelations === false) {
                $package->close();
                require_once 'Zend/Search/Lucene/Document/Exception.php';
                throw new Zend_Search_Lucene_Document_Exception('Invalid archive or corrupted .xlsx file.');
            }
            $workbookRelations->registerXPathNamespace('rel', Zend_Search_Lucene_Document_OpenXml::SCHEMA_RELATIONSHIP);

            // Read shared strings. The relation is optional: a workbook
            // without any string cell has no shared string table.
            $sharedStringsRelations = $workbookRelations->xpath(
                "rel:Relationship[@Type='" . self::SCHEMA_SHAREDSTRINGS . "']"
            );
            if (!empty($sharedStringsRelations)) {
                $xmlStrings = $this->_loadXmlPart(
                    $package,
                    $workbookDir . '/' . (string) $sharedStringsRelations[0]['Target']
                );
                if ($xmlStrings !== false && isset($xmlStrings->si)) {
                    foreach ($xmlStrings->si as $stringItem) {
                        // Cells refer to shared strings by position, so every
                        // <si> must yield exactly one entry (possibly empty).
                        $sharedStrings[] = $this->_parseRichText($stringItem);
                    }
                }
            }

            // Loop relations for workbook and extract worksheets...
            foreach ($workbookRelations->Relationship as $workbookRelation) {
                if ((string) $workbookRelation['Type'] !== self::SCHEMA_WORKSHEETRELATION) {
                    continue;
                }

                $sheetTarget = (string) $workbookRelation['Target'];
                $worksheet   = $this->_loadXmlPart(
                    $package,
                    $workbookDir . '/' . dirname($sheetTarget) . '/' . basename($sheetTarget)
                );
                if ($worksheet !== false) {
                    $worksheets[str_replace('rId', '', (string) $workbookRelation['Id'])] = $worksheet;
                }
            }

            break;
        }

        // Sort worksheets
        ksort($worksheets);

        // Extract contents from worksheets
        foreach ($worksheets as $worksheet) {
            foreach ($worksheet->sheetData->row as $row) {
                foreach ($row->c as $cell) {
                    $documentBody[] = $this->_extractCellValue($cell, $sharedStrings);
                }
            }
        }

        // Read core properties
        $coreProperties = $this->extractMetaData($package);

        // Close file
        $package->close();

        // Store filename
        $this->addField(Zend_Search_Lucene_Field::Text('filename', $fileName, 'UTF-8'));

        // Store contents
        if ($storeContent) {
            $this->addField(Zend_Search_Lucene_Field::Text('body', implode(' ', $documentBody), 'UTF-8'));
        } else {
            $this->addField(Zend_Search_Lucene_Field::UnStored('body', implode(' ', $documentBody), 'UTF-8'));
        }

        // Store meta data properties
        foreach ($coreProperties as $key => $value) {
            $this->addField(Zend_Search_Lucene_Field::Text($key, $value, 'UTF-8'));
        }

        // Store title (if not present in meta data)
        if (!isset($coreProperties['title'])) {
            $this->addField(Zend_Search_Lucene_Field::Text('title', $fileName, 'UTF-8'));
        }
    }

    /**
     * Read one part of the package and parse it as XML
     *
     * @param ZipArchive $package
     * @param string     $path     Part path; normalised with absoluteZipPath()
     * @return SimpleXMLElement|false  False if the part is missing, empty or not parseable
     */
    private function _loadXmlPart(ZipArchive $package, $path)
    {
        $xml = $package->getFromName($this->absoluteZipPath($path));
        if ($xml === false || $xml === '') {
            return false;
        }

        return simplexml_load_string($xml);
    }

    /**
     * Convert a worksheet cell (<c> element) into the value to be indexed
     *
     * @param SimpleXMLElement $cell
     * @param array            $sharedStrings  Shared string table, indexed by position
     * @return mixed  String, integer, float or boolean cell value
     */
    private function _extractCellValue($cell, array $sharedStrings)
    {
        // Determine data type
        $dataType = (string) $cell['t'];
        $rawValue = (string) $cell->v;

        switch ($dataType) {
            case 's':
                // Value is an index into the shared string table
                if ($rawValue === '') {
                    return '';
                }
                $index = intval($rawValue);
                return isset($sharedStrings[$index]) ? $sharedStrings[$index] : '';

            case 'b':
                // Value is boolean
                if ($rawValue == '0') {
                    return false;
                }
                if ($rawValue == '1') {
                    return true;
                }
                return (bool) $cell->v;

            case 'inlineStr':
                // Value is rich text inline
                return $this->_parseRichText($cell->is);

            case 'e':
                // Value is an error message
                return $rawValue;

            default:
                // Value is a string; numeric strings are converted to numbers
                if (is_numeric($rawValue)) {
                    if ($rawValue == (int) $rawValue) {
                        return (int) $rawValue;
                    }
                    if ($rawValue == (float) $rawValue) {
                        return (float) $rawValue;
                    }
                }
                return $rawValue;
        }
    }

    /**
     * Parse rich text XML
     *
     * Returns the text of the <t> child when present, otherwise the
     * concatenated <t> texts of all <r> (run) children. Yields an empty
     * string when neither is available.
     *
     * @param SimpleXMLElement $is
     * @return string
     */
    private function _parseRichText($is = null)
    {
        if ($is === null) {
            return '';
        }

        if (isset($is->t)) {
            return (string) $is->t;
        }

        $text = '';
        if (isset($is->r)) {
            foreach ($is->r as $run) {
                $text .= (string) $run->t;
            }
        }

        return $text;
    }

    /**
     * Load Xlsx document from a file
     *
     * @param string  $fileName
     * @param boolean $storeContent
     * @return Zend_Search_Lucene_Document_Xlsx
     * @throws Zend_Search_Lucene_Document_Exception  If the file is not readable
     */
    public static function loadXlsxFile($fileName, $storeContent = false)
    {
        if (!is_readable($fileName)) {
            require_once 'Zend/Search/Lucene/Document/Exception.php';
            throw new Zend_Search_Lucene_Document_Exception('Provided file \'' . $fileName . '\' is not readable.');
        }

        return new Zend_Search_Lucene_Document_Xlsx($fileName, $storeContent);
    }
}
} // end if (class_exists('ZipArchive'))
Lucene/Document/Html.php 0000666 00000023662 15125712134 0011170 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Document */
require_once 'Zend/Search/Lucene/Document.php';
/**
* HTML document.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Document_Html extends Zend_Search_Lucene_Document
{
    /**
     * List of document links
     *
     * @var array
     */
    private $_links = array();

    /**
     * List of document header links
     *
     * @var array
     */
    private $_headerLinks = array();

    /**
     * Stored DOM representation
     *
     * @var DOMDocument
     */
    private $_doc;

    /**
     * Exclude nofollow links flag
     *
     * If true then links with rel='nofollow' attribute are not included into
     * document links.
     *
     * @var boolean
     */
    private static $_excludeNoFollowLinks = false;

    /**
     * Object constructor
     *
     * Parses the HTML and adds the 'title' field, one field per named
     * <meta> tag and the 'body' field. Also collects the href values of
     * <a> elements and of header <link> elements.
     *
     * @param string  $data          HTML string, or a file name if $isFile is true
     * @param boolean $isFile
     * @param boolean $storeContent  Store the body text (Text field) or only index it (UnStored field)
     */
    private function __construct($data, $isFile, $storeContent)
    {
        $this->_doc = new DOMDocument();
        $this->_doc->substituteEntities = true;

        if ($isFile) {
            $htmlData = file_get_contents($data);
        } else {
            $htmlData = $data;
        }

        // An unreadable file or empty input leaves the DOM empty. loadHTML()
        // must not be called with an empty string: it fails with a warning
        // on older PHP versions and throws a ValueError on PHP 8.
        if ($htmlData !== false && (string) $htmlData !== '') {
            // '@' hides the libxml warnings raised for malformed markup
            @$this->_doc->loadHTML($htmlData);
        }

        // 'encoding' holds the same value as the 'actualEncoding' alias and is
        // what _highlightTextNode() uses as well
        $encoding = $this->_doc->encoding;

        $xpath = new DOMXPath($this->_doc);

        $docTitle = '';
        $titleNodes = $xpath->query('/html/head/title');
        foreach ($titleNodes as $titleNode) {
            // title should always have only one entry, but we process all nodeset entries
            $docTitle .= $titleNode->nodeValue . ' ';
        }
        $this->addField(Zend_Search_Lucene_Field::Text('title', $docTitle, $encoding));

        $metaNodes = $xpath->query('/html/head/meta[@name]');
        foreach ($metaNodes as $metaNode) {
            $this->addField(Zend_Search_Lucene_Field::Text($metaNode->getAttribute('name'),
                                                           $metaNode->getAttribute('content'),
                                                           $encoding));
        }

        $docBody = '';
        $bodyNodes = $xpath->query('/html/body');
        foreach ($bodyNodes as $bodyNode) {
            // body should always have only one entry, but we process all nodeset entries
            $this->_retrieveNodeText($bodyNode, $docBody);
        }
        if ($storeContent) {
            $this->addField(Zend_Search_Lucene_Field::Text('body', $docBody, $encoding));
        } else {
            $this->addField(Zend_Search_Lucene_Field::UnStored('body', $docBody, $encoding));
        }

        $linkNodes = $this->_doc->getElementsByTagName('a');
        foreach ($linkNodes as $linkNode) {
            if (($href = $linkNode->getAttribute('href')) != '' &&
                (!self::$_excludeNoFollowLinks || strtolower($linkNode->getAttribute('rel')) != 'nofollow' )
               ) {
                $this->_links[] = $href;
            }
        }
        $this->_links = array_unique($this->_links);

        $linkNodes = $xpath->query('/html/head/link');
        foreach ($linkNodes as $linkNode) {
            if (($href = $linkNode->getAttribute('href')) != '') {
                $this->_headerLinks[] = $href;
            }
        }
        $this->_headerLinks = array_unique($this->_headerLinks);
    }

    /**
     * Set exclude nofollow links flag
     *
     * @param boolean $newValue
     */
    public static function setExcludeNoFollowLinks($newValue)
    {
        self::$_excludeNoFollowLinks = $newValue;
    }

    /**
     * Get exclude nofollow links flag
     *
     * @return boolean
     */
    public static function getExcludeNoFollowLinks()
    {
        return self::$_excludeNoFollowLinks;
    }

    /**
     * Get node text
     *
     * Appends the text of all text nodes below $node to $text, each followed
     * by a space. <script> elements are skipped, since their content is not
     * necessarily wrapped into comments or CDATA sections.
     *
     * @param DOMNode $node
     * @param string &$text
     */
    private function _retrieveNodeText(DOMNode $node, &$text)
    {
        if ($node->nodeType == XML_TEXT_NODE) {
            $text .= $node->nodeValue ;
            $text .= ' ';
        } else if ($node->nodeType == XML_ELEMENT_NODE  &&  $node->nodeName != 'script') {
            foreach ($node->childNodes as $childNode) {
                $this->_retrieveNodeText($childNode, $text);
            }
        }
    }

    /**
     * Get document HREF links
     *
     * @return array
     */
    public function getLinks()
    {
        return $this->_links;
    }

    /**
     * Get document header links
     *
     * @return array
     */
    public function getHeaderLinks()
    {
        return $this->_headerLinks;
    }

    /**
     * Load HTML document from a string
     *
     * @param string  $data
     * @param boolean $storeContent
     * @return Zend_Search_Lucene_Document_Html
     */
    public static function loadHTML($data, $storeContent = false)
    {
        return new Zend_Search_Lucene_Document_Html($data, false, $storeContent);
    }

    /**
     * Load HTML document from a file
     *
     * @param string  $file
     * @param boolean $storeContent
     * @return Zend_Search_Lucene_Document_Html
     */
    public static function loadHTMLFile($file, $storeContent = false)
    {
        return new Zend_Search_Lucene_Document_Html($file, true, $storeContent);
    }

    /**
     * Highlight text in text node
     *
     * Tokenizes the node text with the default analyzer and wraps every
     * token whose term text is a key of $wordsToHighlight into a styled
     * <b> element.
     *
     * @param DOMText $node
     * @param array   $wordsToHighlight  Words to highlight, given as array keys
     * @param string  $color
     */
    public function _highlightTextNode(DOMText $node, $wordsToHighlight, $color)
    {
        $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
        $analyzer->setInput($node->nodeValue, $this->_doc->encoding);

        $matchedTokens = array();

        while (($token = $analyzer->nextToken()) !== null) {
            if (isset($wordsToHighlight[$token->getTermText()])) {
                $matchedTokens[] = $token;
            }
        }

        if (count($matchedTokens) == 0) {
            return;
        }

        // Process matches from last to first, so that splitting the node does
        // not invalidate the offsets of the matches still to be processed
        $matchedTokens = array_reverse($matchedTokens);

        foreach ($matchedTokens as $token) {
            // Cut text after matched token
            $node->splitText($token->getEndOffset());

            // Cut matched node
            $matchedWordNode = $node->splitText($token->getStartOffset());

            // The text is added as a text node: the value argument of
            // createElement() is not escaped, so an '&' would be mangled
            $highlightedNode = $this->_doc->createElement('b');
            $highlightedNode->appendChild($this->_doc->createTextNode($matchedWordNode->nodeValue));
            $highlightedNode->setAttribute('style', 'color:black;background-color:' . $color);

            $node->parentNode->replaceChild($highlightedNode, $matchedWordNode);
        }
    }

    /**
     * highlight words in content of the specified node
     *
     * @param DOMNode $contextNode
     * @param array   $wordsToHighlight  Words to highlight, given as array keys
     * @param string  $color
     */
    public function _highlightNode(DOMNode $contextNode, $wordsToHighlight, $color)
    {
        $textNodes = array();

        if (!$contextNode->hasChildNodes()) {
            return;
        }

        foreach ($contextNode->childNodes as $childNode) {
            if ($childNode->nodeType == XML_TEXT_NODE) {
                // process node later to leave childNodes structure untouched
                $textNodes[] = $childNode;
            } else {
                // Skip script nodes
                if ($childNode->nodeName != 'script') {
                    $this->_highlightNode($childNode, $wordsToHighlight, $color);
                }
            }
        }

        foreach ($textNodes as $textNode) {
            $this->_highlightTextNode($textNode, $wordsToHighlight, $color);
        }
    }

    /**
     * Highlight text with specified color
     *
     * Modifies the stored DOM in place and returns the resulting HTML. The
     * HTML is returned both when something was highlighted and when there
     * was nothing to highlight.
     *
     * @param string|array $words
     * @param string $color
     * @return string
     */
    public function highlight($words, $color = '#66ffff')
    {
        if (!is_array($words)) {
            $words = array($words);
        }
        $wordsToHighlight = array();

        $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
        foreach ($words as $wordString) {
            $wordsToHighlight = array_merge($wordsToHighlight, $analyzer->tokenize($wordString));
        }

        if (count($wordsToHighlight) == 0) {
            return $this->_doc->saveHTML();
        }

        // Use term texts as keys for fast isset() lookups
        $wordsToHighlightFlipped = array();
        foreach ($wordsToHighlight as $id => $token) {
            $wordsToHighlightFlipped[$token->getTermText()] = $id;
        }

        $xpath = new DOMXPath($this->_doc);

        $matchedNodes = $xpath->query("/html/body");
        foreach ($matchedNodes as $matchedNode) {
            $this->_highlightNode($matchedNode, $wordsToHighlightFlipped, $color);
        }

        return $this->_doc->saveHTML();
    }

    /**
     * Get HTML
     *
     * @return string
     */
    public function getHTML()
    {
        return $this->_doc->saveHTML();
    }
}
Lucene/Document/Docx.php 0000666 00000012276 15125712134 0011160 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Document_OpenXml */
require_once 'Zend/Search/Lucene/Document/OpenXml.php';
if (class_exists('ZipArchive', false)) {
/**
 * Docx document.
 *
 * Builds a Lucene document from an Office Open XML word processing file:
 * the paragraph text becomes the 'body' field, and the package core
 * properties become additional fields.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Document
 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Document_Docx extends Zend_Search_Lucene_Document_OpenXml {
    /**
     * Xml Schema - WordprocessingML
     *
     * @var string
     */
    const SCHEMA_WORDPROCESSINGML = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main';

    /**
     * Object constructor
     *
     * @param string  $fileName
     * @param boolean $storeContent  Store the body text (Text field) or only index it (UnStored field)
     * @throws Zend_Search_Lucene_Document_Exception  If the file is not a usable .docx package
     */
    private function __construct($fileName, $storeContent) {
        // Document data holder
        $documentBody = array();

        // Open OpenXML package; open() returns true or an integer error code
        $package = new ZipArchive();
        if ($package->open($fileName) !== true) {
            require_once 'Zend/Search/Lucene/Document/Exception.php';
            throw new Zend_Search_Lucene_Document_Exception('Provided file \'' . $fileName . '\' is not a readable zip archive.');
        }

        // Read relations and search for officeDocument
        $relations = $this->_loadXmlPart($package, '_rels/.rels');
        if ($relations === false) {
            $package->close();
            require_once 'Zend/Search/Lucene/Document/Exception.php';
            throw new Zend_Search_Lucene_Document_Exception('Invalid archive or corrupted .docx file.');
        }

        foreach ($relations->Relationship as $rel) {
            if ((string) $rel['Type'] !== Zend_Search_Lucene_Document_OpenXml::SCHEMA_OFFICEDOCUMENT) {
                continue;
            }

            // Found office document! Read in contents...
            $documentTarget = (string) $rel['Target'];
            $contents = $this->_loadXmlPart(
                $package,
                dirname($documentTarget) . '/' . basename($documentTarget)
            );
            if ($contents === false) {
                $package->close();
                require_once 'Zend/Search/Lucene/Document/Exception.php';
                throw new Zend_Search_Lucene_Document_Exception('Invalid archive or corrupted .docx file.');
            }

            $contents->registerXPathNamespace('w', Zend_Search_Lucene_Document_Docx::SCHEMA_WORDPROCESSINGML);
            $paragraphs = $contents->xpath('//w:body/w:p');
            if (!is_array($paragraphs)) {
                // xpath() failed; treat as a document without paragraphs
                $paragraphs = array();
            }

            foreach ($paragraphs as $paragraph) {
                $runs = $paragraph->xpath('.//w:r/*[name() = "w:t" or name() = "w:br"]');

                if ($runs === false) {
                    // Paragraph doesn't contain any text or breaks
                    continue;
                }

                foreach ($runs as $run) {
                    if ($run->getName() == 'br') {
                        // Break element
                        $documentBody[] = ' ';
                    } else {
                        $documentBody[] = (string) $run;
                    }
                }

                // Add space after each paragraph. So they are not bound together.
                $documentBody[] = ' ';
            }

            break;
        }

        // Read core properties
        $coreProperties = $this->extractMetaData($package);

        // Close file
        $package->close();

        // Store filename
        $this->addField(Zend_Search_Lucene_Field::Text('filename', $fileName, 'UTF-8'));

        // Store contents
        if ($storeContent) {
            $this->addField(Zend_Search_Lucene_Field::Text('body', implode('', $documentBody), 'UTF-8'));
        } else {
            $this->addField(Zend_Search_Lucene_Field::UnStored('body', implode('', $documentBody), 'UTF-8'));
        }

        // Store meta data properties
        foreach ($coreProperties as $key => $value) {
            $this->addField(Zend_Search_Lucene_Field::Text($key, $value, 'UTF-8'));
        }

        // Store title (if not present in meta data)
        if (! isset($coreProperties['title'])) {
            $this->addField(Zend_Search_Lucene_Field::Text('title', $fileName, 'UTF-8'));
        }
    }

    /**
     * Read one part of the package and parse it as XML
     *
     * @param ZipArchive $package
     * @param string     $path     Part path; normalised with absoluteZipPath()
     * @return SimpleXMLElement|false  False if the part is missing, empty or not parseable
     */
    private function _loadXmlPart(ZipArchive $package, $path) {
        $xml = $package->getFromName($this->absoluteZipPath($path));
        if ($xml === false || $xml === '') {
            return false;
        }

        return simplexml_load_string($xml);
    }

    /**
     * Load Docx document from a file
     *
     * @param string  $fileName
     * @param boolean $storeContent
     * @return Zend_Search_Lucene_Document_Docx
     * @throws Zend_Search_Lucene_Document_Exception  If the file is not readable
     */
    public static function loadDocxFile($fileName, $storeContent = false) {
        if (!is_readable($fileName)) {
            require_once 'Zend/Search/Lucene/Document/Exception.php';
            throw new Zend_Search_Lucene_Document_Exception('Provided file \'' . $fileName . '\' is not readable.');
        }

        return new Zend_Search_Lucene_Document_Docx($fileName, $storeContent);
    }
}
} // end if (class_exists('ZipArchive'))
Lucene/Document/Exception.php 0000666 00000002143 15125712134 0012211 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/**
* Framework base exception
*/
require_once 'Zend/Search/Lucene/Exception.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
// Exception type for document loading problems, e.g. thrown by
// Zend_Search_Lucene_Document_Docx::loadDocxFile() when the file is not readable.
// It declares no members of its own; it only gives callers a more specific
// type to catch than the base Zend_Search_Lucene_Exception.
class Zend_Search_Lucene_Document_Exception extends Zend_Search_Lucene_Exception
{}
Lucene/Document/Pptx.php 0000666 00000016411 15125712134 0011211 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Document_OpenXml */
require_once 'Zend/Search/Lucene/Document/OpenXml.php';
if (class_exists('ZipArchive', false)) {
/**
 * Pptx document.
 *
 * Builds a Lucene document from an Office Open XML presentation: the text
 * of every slide and of its notes becomes the 'body' field, and the
 * package core properties become additional fields.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Document
 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Document_Pptx extends Zend_Search_Lucene_Document_OpenXml
{
    /**
     * Xml Schema - PresentationML
     *
     * @var string
     */
    const SCHEMA_PRESENTATIONML = 'http://schemas.openxmlformats.org/presentationml/2006/main';

    /**
     * Xml Schema - DrawingML
     *
     * @var string
     */
    const SCHEMA_DRAWINGML = 'http://schemas.openxmlformats.org/drawingml/2006/main';

    /**
     * Xml Schema - Slide relation
     *
     * @var string
     */
    const SCHEMA_SLIDERELATION = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide';

    /**
     * Xml Schema - Slide notes relation
     *
     * @var string
     */
    const SCHEMA_SLIDENOTESRELATION = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide';

    /**
     * Object constructor
     *
     * @param string  $fileName
     * @param boolean $storeContent  Store the body text (Text field) or only index it (UnStored field)
     * @throws Zend_Search_Lucene_Document_Exception  If the file is not a usable .pptx package
     */
    private function __construct($fileName, $storeContent)
    {
        // Document data holders
        $slides       = array();
        $slideNotes   = array();
        $documentBody = array();

        // Open OpenXML package; open() returns true or an integer error code
        $package = new ZipArchive();
        if ($package->open($fileName) !== true) {
            require_once 'Zend/Search/Lucene/Document/Exception.php';
            throw new Zend_Search_Lucene_Document_Exception('Provided file \'' . $fileName . '\' is not a readable zip archive.');
        }

        // Read relations and search for officeDocument
        $relations = $this->_loadXmlPart($package, '_rels/.rels');
        if ($relations === false) {
            $package->close();
            require_once 'Zend/Search/Lucene/Document/Exception.php';
            throw new Zend_Search_Lucene_Document_Exception('Invalid archive or corrupted .pptx file.');
        }

        foreach ($relations->Relationship as $rel) {
            if ((string) $rel['Type'] !== Zend_Search_Lucene_Document_OpenXml::SCHEMA_OFFICEDOCUMENT) {
                continue;
            }

            // Found office document! Search for slides...
            $presentationTarget = (string) $rel['Target'];
            $presentationDir    = dirname($presentationTarget);

            $slideRelations = $this->_loadXmlPart(
                $package,
                $presentationDir . '/_rels/' . basename($presentationTarget) . '.rels'
            );
            if ($slideRelations === false) {
                // No relations part: the presentation has no slides to index
                break;
            }

            foreach ($slideRelations->Relationship as $slideRel) {
                if ((string) $slideRel['Type'] !== self::SCHEMA_SLIDERELATION) {
                    continue;
                }

                // Found slide!
                $slideKey    = str_replace('rId', '', (string) $slideRel['Id']);
                $slideTarget = (string) $slideRel['Target'];
                $slideDir    = $presentationDir . '/' . dirname($slideTarget);

                $slide = $this->_loadXmlPart($package, $slideDir . '/' . basename($slideTarget));
                if ($slide === false) {
                    // Slide part is missing or not parseable; nothing to index
                    continue;
                }
                $slides[$slideKey] = $slide;

                // Search for slide notes
                $slideNotesRelations = $this->_loadXmlPart(
                    $package,
                    $slideDir . '/_rels/' . basename($slideTarget) . '.rels'
                );
                if ($slideNotesRelations === false) {
                    continue;
                }

                foreach ($slideNotesRelations->Relationship as $slideNoteRel) {
                    if ((string) $slideNoteRel['Type'] !== self::SCHEMA_SLIDENOTESRELATION) {
                        continue;
                    }

                    // Found slide notes! They are stored under the key of
                    // their slide, so both can be matched up below.
                    $noteTarget = (string) $slideNoteRel['Target'];
                    $slideNote  = $this->_loadXmlPart(
                        $package,
                        $slideDir . '/' . dirname($noteTarget) . '/' . basename($noteTarget)
                    );
                    if ($slideNote !== false) {
                        $slideNotes[$slideKey] = $slideNote;
                    }

                    break;
                }
            }

            break;
        }

        // Sort slides
        ksort($slides);
        ksort($slideNotes);

        // Extract contents from slides
        foreach ($slides as $slideKey => $slide) {
            foreach ($this->_extractText($slide) as $text) {
                $documentBody[] = $text;
            }

            // Extract contents from slide notes
            if (isset($slideNotes[$slideKey])) {
                foreach ($this->_extractText($slideNotes[$slideKey]) as $text) {
                    $documentBody[] = $text;
                }
            }
        }

        // Read core properties
        $coreProperties = $this->extractMetaData($package);

        // Close file
        $package->close();

        // Store filename
        $this->addField(Zend_Search_Lucene_Field::Text('filename', $fileName, 'UTF-8'));

        // Store contents
        if ($storeContent) {
            $this->addField(Zend_Search_Lucene_Field::Text('body', implode(' ', $documentBody), 'UTF-8'));
        } else {
            $this->addField(Zend_Search_Lucene_Field::UnStored('body', implode(' ', $documentBody), 'UTF-8'));
        }

        // Store meta data properties
        foreach ($coreProperties as $key => $value) {
            $this->addField(Zend_Search_Lucene_Field::Text($key, $value, 'UTF-8'));
        }

        // Store title (if not present in meta data)
        if (!isset($coreProperties['title'])) {
            $this->addField(Zend_Search_Lucene_Field::Text('title', $fileName, 'UTF-8'));
        }
    }

    /**
     * Read one part of the package and parse it as XML
     *
     * @param ZipArchive $package
     * @param string     $path     Part path; normalised with absoluteZipPath()
     * @return SimpleXMLElement|false  False if the part is missing, empty or not parseable
     */
    private function _loadXmlPart(ZipArchive $package, $path)
    {
        $xml = $package->getFromName($this->absoluteZipPath($path));
        if ($xml === false || $xml === '') {
            return false;
        }

        return simplexml_load_string($xml);
    }

    /**
     * Collect the text of all DrawingML text elements (a:t) of a slide or
     * slide notes part
     *
     * @param SimpleXMLElement $part
     * @return array  List of strings, in document order
     */
    private function _extractText(SimpleXMLElement $part)
    {
        // Register namespaces
        $part->registerXPathNamespace('p', self::SCHEMA_PRESENTATIONML);
        $part->registerXPathNamespace('a', self::SCHEMA_DRAWINGML);

        // Fetch all text
        $texts = array();
        $textElements = $part->xpath('//a:t');
        if (is_array($textElements)) {
            foreach ($textElements as $textElement) {
                $texts[] = (string) $textElement;
            }
        }

        return $texts;
    }

    /**
     * Load Pptx document from a file
     *
     * @param string  $fileName
     * @param boolean $storeContent
     * @return Zend_Search_Lucene_Document_Pptx
     * @throws Zend_Search_Lucene_Document_Exception  If the file is not readable
     */
    public static function loadPptxFile($fileName, $storeContent = false)
    {
        if (!is_readable($fileName)) {
            require_once 'Zend/Search/Lucene/Document/Exception.php';
            throw new Zend_Search_Lucene_Document_Exception('Provided file \'' . $fileName . '\' is not readable.');
        }

        return new Zend_Search_Lucene_Document_Pptx($fileName, $storeContent);
    }
}
} // end if (class_exists('ZipArchive'))
Lucene/Document/OpenXml.php 0000666 00000010461 15125712134 0011637 0 ustar 00 <?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
* @copyright Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Document */
require_once 'Zend/Search/Lucene/Document.php';
if (class_exists('ZipArchive', false)) {
/**
 * OpenXML document.
 *
 * Base class of the Office Open XML document types. Provides the shared
 * schema constants, core property (meta data) extraction and zip path
 * normalisation.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Document
 * @copyright  Copyright (c) 2005-2008 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
abstract class Zend_Search_Lucene_Document_OpenXml extends Zend_Search_Lucene_Document
{
    /**
     * Xml Schema - Relationships
     *
     * @var string
     */
    const SCHEMA_RELATIONSHIP = 'http://schemas.openxmlformats.org/package/2006/relationships';

    /**
     * Xml Schema - Office document
     *
     * @var string
     */
    const SCHEMA_OFFICEDOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument';

    /**
     * Xml Schema - Core properties
     *
     * This is the relationship type that points at the core properties
     * part. It is not the namespace of the elements inside that part, see
     * SCHEMA_COREPROPERTIES_ELEMENTS.
     *
     * @var string
     */
    const SCHEMA_COREPROPERTIES = 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties';

    /**
     * Xml Schema - Core properties elements (cp: prefix in core.xml)
     *
     * @var string
     */
    const SCHEMA_COREPROPERTIES_ELEMENTS = 'http://schemas.openxmlformats.org/package/2006/metadata/core-properties';

    /**
     * Xml Schema - Dublin Core
     *
     * @var string
     */
    const SCHEMA_DUBLINCORE = 'http://purl.org/dc/elements/1.1/';

    /**
     * Xml Schema - Dublin Core Terms
     *
     * @var string
     */
    const SCHEMA_DUBLINCORETERMS = 'http://purl.org/dc/terms/';

    /**
     * Extract metadata from document
     *
     * Collects the child elements of the core properties part that belong
     * to the Dublin Core, core properties and Dublin Core Terms namespaces.
     * Keys are the local element names. When a name occurs more than once,
     * the value read last wins.
     *
     * @param ZipArchive $package    ZipArchive OpenXML package
     * @return array    Key-value pairs containing document meta data;
     *                  empty if the package has no readable core properties
     */
    protected function extractMetaData(ZipArchive $package)
    {
        // Data holders
        $coreProperties = array();

        // Read relations and search for core properties
        $relationsXml = $package->getFromName('_rels/.rels');
        $relations    = ($relationsXml === false || $relationsXml === '')
                      ? false
                      : simplexml_load_string($relationsXml);
        if ($relations === false) {
            return $coreProperties;
        }

        // Namespaces of the property elements, in the order they are read
        $propertyNamespaces = array(
            self::SCHEMA_DUBLINCORE,
            self::SCHEMA_COREPROPERTIES_ELEMENTS,
            self::SCHEMA_DUBLINCORETERMS
        );

        foreach ($relations->Relationship as $rel) {
            if ((string) $rel['Type'] !== self::SCHEMA_COREPROPERTIES) {
                continue;
            }

            // Found core properties! Read in contents...
            // The path is normalised the same way as every other part path,
            // so targets like 'core.xml' or '/docProps/core.xml' resolve too.
            $target      = (string) $rel['Target'];
            $contentsXml = $package->getFromName(
                $this->absoluteZipPath(dirname($target) . '/' . basename($target))
            );
            $contents    = ($contentsXml === false || $contentsXml === '')
                         ? false
                         : simplexml_load_string($contentsXml);
            if ($contents === false) {
                continue;
            }

            foreach ($propertyNamespaces as $namespace) {
                foreach ($contents->children($namespace) as $child) {
                    $coreProperties[$child->getName()] = (string) $child;
                }
            }
        }

        return $coreProperties;
    }

    /**
     * Determine absolute zip path
     *
     * Accepts both '/' and '\' as separators, drops empty and '.' segments
     * and resolves '..' segments. The result uses '/' as separator and has
     * no leading or trailing separator.
     *
     * @param string $path
     * @return string
     */
    protected function absoluteZipPath($path) {
        $segments  = explode('/', str_replace('\\', '/', $path));
        $absolutes = array();

        foreach ($segments as $segment) {
            if ($segment === '' || $segment === '.') {
                continue;
            }

            if ($segment === '..') {
                array_pop($absolutes);
            } else {
                $absolutes[] = $segment;
            }
        }

        return implode('/', $absolutes);
    }
}
} // end if (class_exists('ZipArchive'))