专注于互联网--专注于架构

最新标签
网站地图
文章索引
Rss订阅

首页 »PHP教程 » mysql全文检索:对MYSQL进行全文检索的PHP类库 »正文

mysql全文检索:对MYSQL进行全文检索的PHP类库

来源: 发布时间:星期一, 2009年1月12日 浏览:23次 评论:0
  真是好东西,但我还没研究出来。如果要看详细的介绍说明和演示,请到这个地方看,很爽: http://steven.haryan.to/php/KwIndex.html

  注意:只能在 Linux、Unix 下使用。

<?php
  
# Set to a true value to make _debug() echo its diagnostic output.
$debug = 0;

# Dumper.lib is only loaded while debugging. It presumably supplies the
# Dumper() function referenced by the commented-out debug calls in KwIndex;
# make sure the file is available before turning $debug on.
if ($debug) require "Dumper.lib";
  
# Echo a debug message when the global $debug flag is true.
#
# Accepts any number of arguments; they are concatenated, HTML-escaped and
# printed inside a <pre> block. Does nothing (and returns nothing) when
# $debug is false.
function _debug() {
    global $debug;

    $args = func_get_args();
    if (!$debug) return;
    echo "<pre>debug: ", htmlentities(join("", $args)), "</pre><br>\n";
}
  
# Keyword index for full-text search, stored in four MySQL tables named
# "<index_name>_doclist", "_wordlist", "_vectorlist" and "_stoplist".
#
# Subclasses must override document_sub() so the index can fetch the text of
# the documents it is asked to add.
#
# Most methods report failure by storing a message in $this->ERROR and
# returning nothing; they return 1 (or their result) on success. Programming
# errors (wrong argument types, unknown options) terminate via die().
#
# NOTE: this class targets the legacy ext/mysql API (mysql_* functions)
# because the public 'linkid' argument is an ext/mysql connection resource.
class KwIndex {

    # Name of the database holding the index tables.
    var $db_name;
    # Prefix of the four index tables.
    var $index_name;
    # Expected maximum row counts; used to choose integer column widths.
    var $wordlist_cardinality;
    var $stoplist_cardinality;
    var $vectorlist_cardinality;
    var $doclist_cardinality;
    # Words longer than this many bytes are not indexed.
    var $max_word_length;
    # ext/mysql connection resource.
    var $linkid;
    # Lower-cased stop words, stored as array keys.
    var $stoplist = array();
    # Message describing the most recent failure.
    var $ERROR;

    # CONSTRUCTOR
    #############

    # Create the index object, connect to the database if needed, create the
    # index tables when they do not exist yet, and load the stop list.
    #
    # $args is an associative array. Required: 'db_name' and either 'linkid'
    # (an open connection) or 'hostname', 'username' and 'password'.
    # Optional: 'index_name', '*_cardinality', 'max_word_length',
    # 'use_persistent_connection'.
    function KwIndex($args) {
        # check for argument type
        if (!is_array($args))
            die("KwIndex: constructor: syntax: KwIndex(array \$args)");

        # check for unknown arguments
        $known_arguments = array_flip(array(
            "linkid", "db_name", "hostname", "username", "password",
            "index_name", "wordlist_cardinality", "doclist_cardinality",
            "stoplist_cardinality", "vectorlist_cardinality",
            "max_word_length", "use_persistent_connection"));
        foreach ($args as $k => $v)
            if (!isset($known_arguments[$k]))
                die("KwIndex: constructor: unknown argument `$k'");

        # check for required arguments
        if (!isset($args["db_name"]))
            die("KwIndex: constructor: You must specify 'db_name'");
        if (!isset($args["linkid"]) &&
            (!isset($args["hostname"]) || !isset($args["username"]) ||
             !isset($args["password"])))
            die("KwIndex: constructor: You must either specify 'linkid' or ".
                "arguments to mysql_connect ('hostname', 'username', and ".
                "'password')");

        # supply default values for optional arguments
        $defaults = array(
            "index_name"                => "kwindex",
            "wordlist_cardinality"      => 100000,
            "stoplist_cardinality"      => 10000,
            "vectorlist_cardinality"    => 100000000,
            "doclist_cardinality"       => 1000000,
            "max_word_length"           => 32,
            "use_persistent_connection" => 1);
        foreach ($defaults as $name => $value)
            if (!isset($args[$name]))
                $args[$name] = $value;

        # object attributes
        $this->db_name = $args["db_name"];
        $this->index_name = $args["index_name"];
        $this->wordlist_cardinality = $args["wordlist_cardinality"];
        $this->stoplist_cardinality = $args["stoplist_cardinality"];
        $this->vectorlist_cardinality = $args["vectorlist_cardinality"];
        $this->doclist_cardinality = $args["doclist_cardinality"];
        $this->max_word_length = $args["max_word_length"];

        if (!isset($args["linkid"])) {
            if ($args["use_persistent_connection"]) {
                $linkid = mysql_pconnect($args["hostname"], $args["username"],
                                         $args["password"]);
            } else {
                $linkid = mysql_connect($args["hostname"], $args["username"],
                                        $args["password"]);
            }
            if (!$linkid)
                die("KwIndex: constructor: Can't connect to database: ".
                    mysql_error());
        } else {
            $linkid = $args["linkid"];
        }

        $this->linkid = $linkid;
        $idx = $this->index_name;

        if (!mysql_select_db($this->db_name, $linkid))
            die("KwIndex: constructor: Can't select DB: ".
                mysql_error($linkid));

        if (!$this->_index_exists()) $this->_create_index();

        # load stoplist as keys of array
        $this->stoplist = array();
        if (!($res = mysql_query("SELECT word FROM {$idx}_stoplist",
                                 $linkid)))
            die("KwIndex: constructor: Can't load stoplist: ".
                mysql_error($linkid));
        while ($row = mysql_fetch_row($res)) {
            $this->stoplist[ strtolower($row[0]) ] = 1;
        }
    } // constructor

    # PUBLIC METHODS
    ################

    # Must be overridden by a subclass. Given an array of document ids it
    # must return an array (id => document text) with one entry per id.
    function &document_sub($doc_ids) {
        die("KwIndex: document_sub: this method must be overriden");
    }

    # Index the documents with the given ids. The text is obtained from
    # document_sub(). Returns 1 on success, nothing on failure (see ERROR).
    function add_document($doc_ids) {
        if (!is_array($doc_ids))
            die("KwIndex: syntax: add_document(array \$doc_ids)");

        if (!count($doc_ids)) return 1;

        $wordlist = array();
        # structure: ( 'word1' => [ [doc_id,freq], ... ], ... )
        $doclist = array();
        # format: ( doc_id => n, ... ); # n = number of words in document

        # retrieve documents
        ####################
        $docs = $this->document_sub($doc_ids);

        if (!is_array($docs))
            die("KwIndex: add_document: ".
                "'document_sub' does not return an array");
        # fewer documents than ids means some are missing; more means extras
        if (count($docs) < count($doc_ids))
            die("KwIndex: add_document: ".
                "'document_sub' does not return enough documents");
        if (count($docs) > count($doc_ids))
            die("KwIndex: add_document: ".
                "'document_sub' returns too many documents");

        # split documents into words
        ############################
        foreach ($docs as $id => $doc) {
            if (!isset($doc) || !strlen($doc)) continue;

            $words = $this->_split_to_words($doc);
            $num_of_words = count($words);
            # note: this means that numbers, etc are counted
            $doclist[$id] = $num_of_words;

            # filter non-qualifying words: 1-char length, numbers, words
            # that are too long; count occurrences of the remaining ones
            $w2 = array();
            foreach ($words as $v) {
                $len = strlen($v);
                $lower_v = strtolower($v);
                if ($len > 1 &&
                    $len <= $this->max_word_length &&
                    preg_match("/[a-z]/", $lower_v) &&
                    !isset($this->stoplist[$lower_v])) {
                    if (!isset($w2[$lower_v])) $w2[$lower_v] = 0;
                    $w2[$lower_v]++;
                }
            }

            # record the relative frequency of each word in this document
            foreach ($w2 as $k => $v) {
                $lower_k = strtolower($k);
                if (!isset($wordlist[$lower_k]))
                    $wordlist[$lower_k] = array();
                array_push($wordlist[$lower_k],
                           array($id, $v/$num_of_words));
            }
        }

        #_debug("wordlist: ", Dumper($wordlist));

        # submit to database
        ####################
        $linkid = $this->linkid;
        $idx = $this->index_name;

        # lock the tables in case some other process removes a certain word
        # between step 0 and 1 and 2 and 3
        if (!mysql_query("LOCK TABLES {$idx}_doclist WRITE, ".
                         "{$idx}_vectorlist WRITE, ".
                         "{$idx}_wordlist WRITE",
                         $linkid)) {
            $this->_fail("Can't lock tables when adding documents: ");
            return;
        }

        # 0
        # add the docs first
        #_debug( "doclist = ", Dumper($doclist));
        foreach ($doclist as $k => $v) {
            if (!mysql_query("REPLACE INTO {$idx}_doclist (id,n) VALUES (".
                             $this->_quote($k).",".$this->_quote($v).")",
                             $linkid)) {
                $this->_fail("Can't add doc id=`$k' to doclist: ", 1);
                return;
            }
        }

        # 1
        # and then add the words
        foreach ($wordlist as $k => $v) {
            if (!mysql_query("INSERT IGNORE INTO {$idx}_wordlist (word) ".
                             "VALUES (".$this->_quote($k).")",
                             $linkid)) {
                $this->_fail("Can't add word `$k' to wordlist: ", 1);
                return;
            }
        }

        # 2
        # get the resulting word ids
        $word_ids = array();
        $e_wordlist = array();
        foreach ($wordlist as $k => $v)
            array_push($e_wordlist, $this->_quote($k));

        if (count($wordlist)) {
            if (!($res = mysql_query("SELECT id,word FROM {$idx}_wordlist ".
                                     "WHERE word IN (".
                                     join(',', $e_wordlist).
                                     ")",
                                     $linkid))) {
                $this->_fail("Can't get data from wordlist: ", 1);
                return;
            }
            while ($row = mysql_fetch_row($res)) {
                $word_ids[ $row[1] ] = $row[0];
            }
        }

        # 3
        # now add the vectors
        #_debug("word_ids = ", Dumper($word_ids));
        foreach ($wordlist as $word => $hitlist) {
            foreach ($hitlist as $hit) {
                if (!mysql_query("INSERT INTO {$idx}_vectorlist (wid,did,f)".
                                 "VALUES (".
                                 $this->_quote($word_ids[$word]).",".
                                 $this->_quote($hit[0]).",".
                                 $this->_quote($hit[1]).")",
                                 $linkid)) {
                    $this->_fail("Can't add to vectorlist: ", 1);
                    return;
                }
            }
        }

        # all goes well, return TRUE
        mysql_query('UNLOCK TABLES', $linkid);
        return 1;
    }

    # Remove the documents with the given ids from the index.
    # Returns 1 on success, nothing on failure (see ERROR).
    function remove_document($doc_ids) {
        if (!is_array($doc_ids))
            die("KwIndex: syntax: remove_document(array \$doc_ids)");

        if (!count($doc_ids)) return 1;

        $linkid = $this->linkid;
        $idx = $this->index_name;

        # quote every id so arbitrary values cannot alter the statement
        $id_list = join(',', array_map(array($this, '_quote'), $doc_ids));

        if (!mysql_query("LOCK TABLES {$idx}_doclist WRITE, ".
                         "{$idx}_vectorlist WRITE",
                         $linkid)) {
            $this->_fail("Can't lock tables when removing documents: ");
            return;
        }

        if (!mysql_query("DELETE FROM {$idx}_doclist WHERE id IN (".
                         $id_list.")",
                         $linkid)) {
            $this->_fail("Can't delete from doclist: ", 1);
            return;
        }

        if (!mysql_query("DELETE FROM {$idx}_vectorlist WHERE did IN (".
                         $id_list.")",
                         $linkid)) {
            $this->_fail("Can't delete from vectorlist: ", 1);
            return;
        }

        # all goes well, return TRUE
        mysql_query("UNLOCK TABLES", $linkid);
        return 1;
    }

    # Re-index documents: remove them, then add them again.
    # Returns a true value only when both steps succeed.
    function update_document($doc_ids) {
        if (!is_array($doc_ids))
            die("KwIndex: syntax: update_document(array \$doc_ids)");

        if (!count($doc_ids)) return 1;

        return $this->remove_document($doc_ids) &&
               $this->add_document($doc_ids);
    }

    # find all words that are contained in at least $k % of all documents
    # Returns an array of words, or nothing on failure (see ERROR).
    function &common_word($k = 80) {
        $linkid = $this->linkid;
        $idx = $this->index_name;
        $none = null;

        # first select the number of documents; document_count() returns
        # nothing when its query fails
        $num = $this->document_count();
        if ($num === null || $num == -1) {
            $this->_fail("Can't retrieve the number of documents: ");
            return $none;
        }

        # get the statistics from vectorlist
        if (!($res1 = mysql_query("SELECT wid,COUNT(*)/$num as k FROM
                                   {$idx}_vectorlist GROUP BY wid HAVING k>=".
                                  ((float)$k / 100),
                                  $linkid))) {
            $this->_fail("Can't retrieve common words: ");
            return $none;
        }
        $wids = array();
        while ($row = mysql_fetch_row($res1)) array_push($wids, $row[0]);

        # convert it to word by consulting the wordlist table
        $words = array();
        if (count($wids)) {
            if (!($res2 = mysql_query("SELECT word FROM {$idx}_wordlist ".
                                      "WHERE id IN (".join(',', $wids).
                                      ")",
                                      $linkid))) {
                $this->_fail("Can't retrieve common words: ");
                return $none;
            }

            while ($row = mysql_fetch_row($res2))
                array_push($words, $row[0]);
        }

        return $words;
    }

    # find all words that are not contained in all documents (vectorlist)
    # XXX not yet written; always returns an empty array
    function &orphan_word() {
        $orphans = array();
        return $orphans;
    }

    # remove words from index
    # Returns 1 on success, nothing on failure (see ERROR).
    function remove_word($words) {
        if (!is_array($words))
            die("KwIndex: syntax: remove_word(array \$words)");

        # nothing to do; also avoids building an invalid "IN ()" clause
        if (!count($words)) return 1;

        $linkid = $this->linkid;
        $idx = $this->index_name;

        if (!mysql_query("LOCK TABLES {$idx}_wordlist WRITE, ".
                         "{$idx}_vectorlist WRITE",
                         $linkid)) {
            $this->_fail("Can't lock tables when removing words: ");
            return;
        }

        $e_words = array();
        foreach ($words as $v)
            array_push($e_words, $this->_quote(strtolower($v)));

        # retrieve word ids
        if (!($res0 = mysql_query("SELECT id FROM {$idx}_wordlist WHERE ".
                                  "word IN (".join(',', $e_words).")",
                                  $linkid))) {
            $this->_fail("Can't delete from wordlist: ", 1);
            return;
        }

        $word_ids = array();
        while ($row = mysql_fetch_row($res0))
            array_push($word_ids, $row[0]);
        $word_ids = join(',', $word_ids); # we'll make it a string now

        # none of the words are indexed: release the locks and report success
        if (!strlen($word_ids)) {
            mysql_query("UNLOCK TABLES", $linkid);
            return 1;
        }

        # delete from wordlist
        if (!mysql_query("DELETE FROM {$idx}_wordlist WHERE id IN ".
                         "($word_ids)",
                         $linkid)) {
            $this->_fail("Can't delete from wordlist: ", 1);
            return;
        }
        if (!mysql_query("DELETE FROM {$idx}_vectorlist WHERE wid IN ".
                         "($word_ids)", $linkid)) {
            $this->_fail("Can't delete from vectorlist: ", 1);
            return;
        }

        # all goes well: the locks taken above must be released here too
        mysql_query("UNLOCK TABLES", $linkid);
        return 1;
    }

    # add stop words. note: you must manually delete previously indexed
    # words with remove_word
    # Returns 1 on success, nothing on failure (see ERROR).
    function add_stop_word($words) {
        if (!is_array($words))
            die("KwIndex: syntax: add_stop_word(array words)");

        if (!count($words)) return 1;

        $linkid = $this->linkid;
        $idx = $this->index_name;

        foreach ($words as $v) {
            $lv = strtolower($v);
            if (!mysql_query("REPLACE INTO {$idx}_stoplist (word) VALUES (".
                             $this->_quote($lv).
                             ")", $linkid)) {
                $this->_fail("Can't add to stoplist: ");
                return;
            }
            $this->stoplist[ $lv ] = 1;
        }

        return 1;
    }

    # remove stop words from index
    # Returns 1 on success, nothing on failure (see ERROR).
    function remove_stop_word($words) {
        if (!is_array($words))
            die("KwIndex: syntax: remove_stop_word(array words)");

        if (!count($words)) return 1;

        $linkid = $this->linkid;
        $idx = $this->index_name;

        $e_words = array();
        foreach ($words as $v)
            array_push($e_words, $this->_quote(strtolower($v)));

        if (!mysql_query("DELETE FROM {$idx}_stoplist WHERE word IN (".
                         join(',', $e_words).")",
                         $linkid)) {
            $this->_fail("Can't delete from stoplist: ");
            return;
        }

        # keep the in-memory stop list in step with the table
        foreach ($words as $v)
            unset($this->stoplist[ strtolower($v) ]);

        return 1;
    }

    # Tell whether $word (case-insensitive) is in the loaded stop list.
    function is_stop_word($word) {
        return isset($this->stoplist[ strtolower($word) ]);
    }

    # Shared implementation of search() and match_count().
    #
    # $args options: 'words' (required; array of words or a phrase string),
    # 're' (true: match words with REGEXP instead of LIKE), 'boolean'
    # ('AND' or 'OR', default 'OR'), 'num' and 'start' (result window,
    # 'start' is 1-based).
    #
    # Returns the number of matching documents when $is_count is true,
    # otherwise an array of document ids ordered by score; nothing on
    # failure (see ERROR).
    function &_search_or_match_count($is_count, &$args) {
        if (!isset($args["words"]))
            die("KwIndex: search: option 'words' must be defined");

        $linkid = $this->linkid;
        $idx = $this->index_name;
        $none = null;
        $no_match = $is_count ? 0 : array();

        # split the words if we are offered a single string/not array
        # (assume it's a phrase)
        if (is_array($args["words"])) {
            $words = $args["words"];
        } else {
            $words = $this->_split_to_words($args["words"]);
        }

        # delete duplicate words, convert them all to lowercase
        $hashwords = array();
        foreach ($words as $v) $hashwords[ strtolower($v) ] = 1;
        $words = array_keys($hashwords);

        if (!count($words)) return $no_match;

        # first we retrieve the word ids
        $use_regexp = !empty($args["re"]);
        $op = $use_regexp ? 'REGEXP' : 'LIKE';
        $bool = (isset($args['boolean']) && $args['boolean'] &&
                 strtoupper($args['boolean']) == 'AND') ? 'AND' : 'OR';

        $op_phrases = array();
        foreach ($words as $v)
            array_push($op_phrases, "word $op ".$this->_quote($v));

        if (!($res0 = mysql_query("SELECT id FROM {$idx}_wordlist WHERE ".
                                  join(' OR ', $op_phrases),
                                  $linkid))) {
            $this->_fail("Can't retrieve word ids: ");
            return $none;
        }
        $word_ids = array();
        while ($row = mysql_fetch_row($res0)) array_push($word_ids, $row[0]);

        if (!count($word_ids) ||
            ($bool == 'AND' && count($word_ids) < count($words)))
            return $no_match;

        # and then we search the vectorlist
        $can_optimize = 0;
        $stmt = '';
        $id_list = join(',', $word_ids);
        # in AND mode every word id must be present in a matching document
        $having = ($bool == 'AND' && !$use_regexp)
            ? "HAVING c >= ".count($word_ids)
            : '';

        if ($is_count) {

            if ($having != '') {
                $stmt = 'SELECT did,count(wid) as c '.
                        "FROM {$idx}_vectorlist WHERE wid IN (".
                        $id_list.
                        ") ".
                        "GROUP BY did ".
                        $having;
            } else {
                # a single aggregate row is enough, no grouping needed
                $can_optimize = 1;
                $stmt = "SELECT COUNT(DISTINCT did) ".
                        "FROM {$idx}_vectorlist WHERE wid IN (".
                        $id_list.
                        ")";
            }

        } else { // ! $is_count

            # 'num'/'start' come from the caller: force them to integers
            $limit = '';
            if (isset($args['num'])) {
                $limit = "LIMIT ".
                    (isset($args['start'])
                        ? max(0, (int)$args['start'] - 1).","
                        : '').
                    (int)$args['num'];
            }

            $stmt = "SELECT did, count(wid) as c, avg(f) as a, ".
                    "count(wid)*count(wid)*count(wid)*avg(f) as ca ".
                    "FROM {$idx}_vectorlist WHERE wid IN (".
                    $id_list.
                    ") ".
                    "GROUP BY did ".
                    $having.
                    " ORDER BY ca DESC ".
                    $limit;

        }

        _debug("search SQL: ", $stmt);

        if (!($res = mysql_query($stmt, $linkid))) {
            $this->_fail("Can't search vectorlist: ");
            return $none;
        }

        if ($is_count) {

            if ($can_optimize) {
                $row = mysql_fetch_row($res);
                $total = $row[0];
            } else {
                $total = mysql_num_rows($res);
            }
            return $total;

        }

        // ! $is_count
        $doc_ids = array();
        while ($row = mysql_fetch_row($res)) array_push($doc_ids, $row[0]);
        return $doc_ids;
    }

    # Return the ids of the documents matching $args, best match first.
    # See _search_or_match_count() for the options.
    function &search($args) {
        $result = $this->_search_or_match_count(0, $args);
        return $result;
    }

    # Return the number of documents matching $args.
    # See _search_or_match_count() for the options.
    function &match_count($args) {
        $result = $this->_search_or_match_count(1, $args);
        return $result;
    }

    # Drop all four index tables.
    # Returns 1 on success, nothing on failure (see ERROR).
    function remove_index() {
        $linkid = $this->linkid;

        foreach ($this->_index_tables() as $table) {
            if (!mysql_query("DROP TABLE IF EXISTS $table", $linkid)) {
                $this->_fail("Can't remove table $table: ");
                return;
            }
        }

        return 1;
    }

    # Drop and recreate the index tables, leaving an empty index.
    function empty_index() {
        return $this->remove_index() && $this->_create_index();
    }

    # number of documents in the collection (nothing if the query fails)
    function document_count() {
        $linkid = $this->linkid;
        $idx = $this->index_name;

        if (!($res = mysql_query("SELECT COUNT(*) FROM {$idx}_doclist",
                                 $linkid))) return;
        $row = mysql_fetch_row($res);
        return $row[0];
    }

    # number of unique words (nothing if the query fails)
    function word_count() {
        $linkid = $this->linkid;
        $idx = $this->index_name;

        if (!($res = mysql_query("SELECT COUNT(*) FROM {$idx}_wordlist",
                                 $linkid))) return;
        $row = mysql_fetch_row($res);
        return $row[0];
    }

    # PRIVATE METHODS
    #################

    # Split text into words: runs of word characters that may contain
    # apostrophes in the middle (e.g. "don't"). Returns an array of words.
    function &_split_to_words($str) {
        preg_match_all("/\\b(\\w[\\w']*\\w+|\\w+)\\b/", $str, $matches);
        return $matches[1];
    }

    # Record a failure: store $message followed by the MySQL error text in
    # $this->ERROR and, when $unlock is true, release the table locks.
    # The error text is read before UNLOCK TABLES runs so that the unlock
    # query cannot overwrite it.
    function _fail($message, $unlock = 0) {
        $this->ERROR = $message . mysql_error($this->linkid);
        if ($unlock) mysql_query("UNLOCK TABLES", $this->linkid);
    }

    # Return $value as a quoted SQL string literal, escaped for the current
    # connection.
    function _quote($value) {
        return "'" . mysql_real_escape_string((string)$value, $this->linkid)
             . "'";
    }

    # (Re)create the four index tables, dropping existing ones first.
    # Returns 1 on success, nothing when the old tables cannot be dropped;
    # dies when a table cannot be created.
    function _create_index() {
        $linkid = $this->linkid;
        $idx = $this->index_name;
        $word_len = (int)$this->max_word_length;

        $stmt = '';

        # drop previous tables, if they exist
        if (!$this->remove_index()) return;

        # create doclist table
        $stmt = "CREATE TABLE {$idx}_doclist " .
                '(id ' . $this->__column_type($this->doclist_cardinality).
                ' AUTO_INCREMENT PRIMARY KEY, ' .
                ' n ' . $this->__column_type($this->wordlist_cardinality).
                ' NOT NULL'.
                ')';
        if (!mysql_query($stmt, $linkid))
            die("KwIndex: Can't create table {$idx}_doclist: ".
                mysql_error($linkid));

        # create wordlist table
        $stmt = "CREATE TABLE {$idx}_wordlist " .
                '(id ' . $this->__column_type($this->wordlist_cardinality) .
                ' AUTO_INCREMENT PRIMARY KEY, ' .
                " word VARCHAR($word_len)" .
                ' BINARY NOT NULL, ' .
                "UNIQUE (word) " .
                ')';
        if (!mysql_query($stmt, $linkid))
            die("KwIndex: Can't create table {$idx}_wordlist: ".
                mysql_error($linkid));

        # create stoplist table
        $stmt = "CREATE TABLE {$idx}_stoplist " .
                '(id ' . $this->__column_type($this->wordlist_cardinality).
                ' AUTO_INCREMENT PRIMARY KEY, ' .
                " word VARCHAR($word_len)" .
                ' BINARY NOT NULL, ' .
                "UNIQUE (word) " .
                ')';
        if (!mysql_query($stmt, $linkid))
            die("KwIndex: Can't create table {$idx}_stoplist: ".
                mysql_error($linkid));

        # create vectorlist table
        $stmt = "CREATE TABLE {$idx}_vectorlist " .
                '(wid '. $this->__column_type($this->wordlist_cardinality).
                ' NOT NULL, ' .
                'did '. $this->__column_type($this->doclist_cardinality).
                ' NOT NULL, '.
                'UNIQUE (wid,did), '.
                'f FLOAT(10,4) NOT NULL'.
                ')';
        if (!mysql_query($stmt, $linkid)) {
            die("KwIndex: Can't create table {$idx}_vectorlist: ".
                mysql_error($linkid));
        }

        # the new stoplist table is empty, so the cached copy must be too
        $this->stoplist = array();

        return 1;
    }

    # Pick the narrowest unsigned MySQL integer type able to hold
    # $cardinality distinct ids.
    function __column_type($cardinality) {
        if ($cardinality >= 16*1024*1024) return 'INT UNSIGNED';
        if ($cardinality >= 64*1024) return 'MEDIUMINT UNSIGNED';
        if ($cardinality >= 256) return 'SMALLINT UNSIGNED';
        return 'TINYINT UNSIGNED';
    }

    # Names of the four tables that make up this index.
    function &_index_tables() {
        $idx = $this->index_name;

        $tables = array(
            "{$idx}_doclist",
            "{$idx}_wordlist",
            "{$idx}_vectorlist",
            "{$idx}_stoplist");

        return $tables;
    }

    # Return 1 when all four index tables exist in the database, else 0.
    function _index_exists() {
        $linkid = $this->linkid;

        if (!($res = mysql_list_tables($this->db_name, $linkid)))
            die("Can't list table: ".mysql_error($linkid));

        $existing_tables = array();
        $total = mysql_num_rows($res);
        for ($i = 0; $i < $total; $i++) {
            $existing_tables[ mysql_tablename($res, $i) ] = 1;
        }

        foreach ($this->_index_tables() as $v) {
            if (!isset($existing_tables[$v])) return 0;
        }

        return 1;
    }

} // class
  
?>


0

相关文章

读者评论

发表评论

  • 昵称:
  • 内容: