首页 »PHP教程 » mysql全文检索:对MYSQL进行全文检索的PHP类库 »正文
mysql全文检索:对MYSQL进行全文检索的PHP类库
来源: 发布时间:星期一, 2009年1月12日 浏览:23次 评论:0
真是好东西,但我还没研究出来,如果要看这个程序的详细介绍说明和演示,请到这个地方看,真的很爽: http://steven.haryan.to/php/KwIndex.html 注意,只能在linux,unix下用。 <?php $debug = 0;
# Dumper.lib provides Dumper(), which is only needed for debug output.
# It is a separate library file that must be installed alongside this one
# when $debug is switched on.
if (!empty($debug)) require "Dumper.lib";

# Print a debug message when the global $debug flag is truthy.
#
# Accepts any number of arguments; they are concatenated, HTML-escaped and
# echoed inside a <pre> block. Does nothing (and returns nothing) when
# $debug is falsy.
function _debug() {
    global $debug;
    $args = func_get_args();
    if (!$debug) return;
    echo "<pre>debug: ", htmlentities(join("", $args)), "</pre><br>\n";
}
# KwIndex: a keyword (full-text) index stored in four MySQL tables
# ({index}_doclist, {index}_wordlist, {index}_vectorlist, {index}_stoplist).
#
# Usage: subclass KwIndex and override document_sub() so that it returns
# the text of the documents identified by the ids it is given; then call
# add_document()/remove_document()/update_document() to maintain the index
# and search()/match_count() to query it.
#
# Error convention: argument/usage errors and unrecoverable setup errors
# die(); runtime database errors make the method return an empty value and
# leave a description in $this->ERROR.
#
# NOTE(review): this class uses the legacy ext/mysql API (mysql_* functions)
# because the 'linkid' argument is an ext/mysql link resource; that
# extension is not bundled with PHP 7+. Values are escaped with
# addslashes(), as in the original code.
class KwIndex {

    # Object attributes (all populated by the constructor).
    var $db_name;
    var $index_name;
    var $wordlist_cardinality;
    var $stoplist_cardinality;
    var $vectorlist_cardinality;
    var $doclist_cardinality;
    var $max_word_length;
    var $linkid;
    var $stoplist = array();  # stop words, stored as lower-cased array keys
    var $ERROR;               # description of the last runtime error

    # CONSTRUCTOR
    #############

    # PHP 4 style constructor, kept so that existing subclasses calling
    # $this->KwIndex($args) continue to work. Delegates to __construct().
    function KwIndex($args) {
        $this->__construct($args);
    }

    # Validate the arguments, connect to the database (unless a 'linkid' is
    # supplied), create the index tables if they are missing and load the
    # stoplist into memory.
    #
    # $args is an associative array. Required: 'db_name' plus either
    # 'linkid' or all of 'hostname', 'username', 'password'. Optional, with
    # defaults: 'index_name' (kwindex), 'wordlist_cardinality' (100000),
    # 'stoplist_cardinality' (10000), 'vectorlist_cardinality' (100000000),
    # 'doclist_cardinality' (1000000), 'max_word_length' (32),
    # 'use_persistent_connection' (1).
    function __construct($args) {
        # check for argument type
        if (!is_array($args))
            die("KwIndex: constructor: syntax: KwIndex(array \$args)");

        # check for unknown arguments
        $known_arguments = array_flip(array(
            "linkid", "db_name", "hostname", "username", "password",
            "index_name", "wordlist_cardinality", "doclist_cardinality",
            "stoplist_cardinality", "vectorlist_cardinality",
            "max_word_length", "use_persistent_connection"));
        foreach ($args as $k => $v) {
            if (!isset($known_arguments[$k]))
                die("KwIndex: constructor: unknown argument `$k'");
        }

        # check for required arguments
        if (!isset($args["db_name"]))
            die("KwIndex: constructor: You must specify 'db_name'");
        if (!isset($args["linkid"]) &&
            (!isset($args["hostname"]) ||
             !isset($args["username"]) ||
             !isset($args["password"])))
            die("KwIndex: constructor: You must either specify 'linkid' or ".
                "arguments to mysql_connect ('hostname', 'username', and ".
                "'password')");

        # supply default values for optional arguments
        $defaults = array(
            "index_name"                => "kwindex",
            "wordlist_cardinality"      => 100000,
            "stoplist_cardinality"      => 10000,
            "vectorlist_cardinality"    => 100000000,
            "doclist_cardinality"       => 1000000,
            "max_word_length"           => 32,
            "use_persistent_connection" => 1);
        foreach ($defaults as $name => $value) {
            if (!isset($args[$name])) $args[$name] = $value;
        }

        # object attributes
        $this->db_name                = $args["db_name"];
        $this->index_name             = $args["index_name"];
        $this->wordlist_cardinality   = $args["wordlist_cardinality"];
        $this->stoplist_cardinality   = $args["stoplist_cardinality"];
        $this->vectorlist_cardinality = $args["vectorlist_cardinality"];
        $this->doclist_cardinality    = $args["doclist_cardinality"];
        $this->max_word_length        = $args["max_word_length"];

        if (!isset($args["linkid"])) {
            if ($args["use_persistent_connection"]) {
                $linkid = mysql_pconnect($args["hostname"], $args["username"],
                                         $args["password"]);
            } else {
                $linkid = mysql_connect($args["hostname"], $args["username"],
                                        $args["password"]);
            }
            if (!$linkid)
                die("KwIndex: constructor: Can't connect to database: ".
                    mysql_error());
        } else {
            $linkid = $args["linkid"];
        }
        $this->linkid = $linkid;
        $idx = $this->index_name;

        if (!mysql_select_db($this->db_name, $linkid))
            die("KwIndex: constructor: Can't select DB: ".
                mysql_error($linkid));

        if (!$this->_index_exists()) $this->_create_index();

        # load stoplist as keys of array
        $this->stoplist = array();
        if (!($res = mysql_query("SELECT word FROM {$idx}_stoplist", $linkid)))
            die("KwIndex: constructor: Can't load stoplist: ".
                mysql_error($linkid));
        while ($row = mysql_fetch_row($res)) {
            $this->stoplist[ strtolower($row[0]) ] = 1;
        }
    } // constructor

    # PUBLIC METHODS
    ################

    # Must be overridden by a subclass: given an array of document ids,
    # return an array (doc_id => document text). The base implementation
    # always dies.
    function &document_sub($doc_ids) {
        die("KwIndex: document_sub: this method must be overriden");
    }

    # Index the documents whose ids are listed in $doc_ids (array).
    # The text is obtained from document_sub(). Returns 1 on success; on a
    # database error sets $this->ERROR and returns nothing.
    function add_document($doc_ids) {
        if (!is_array($doc_ids))
            die("KwIndex: syntax: add_document(array \$doc_ids)");
        if (!count($doc_ids)) return 1;

        $wordlist = array(); # structure: ( 'word1' => ( (doc_id,freq), ... ), ... )
        $doclist  = array(); # format: ( doc_id => n, ... ); n = number of words in document

        # retrieve documents
        ####################

        $docs = $this->document_sub($doc_ids);
        if (!is_array($docs))
            die("KwIndex: add_document: ".
                "'document_sub' does not return an array");
        # the messages below were attached to the opposite comparisons in
        # the original code
        if (count($doc_ids) > count($docs))
            die("KwIndex: add_document: ".
                "'document_sub' does not return enough documents");
        if (count($doc_ids) < count($docs))
            die("KwIndex: add_document: ".
                "'document_sub' returns too many documents");

        # split documents into words
        ############################

        foreach ($docs as $id => $doc) {
            if (!isset($doc) || !strlen($doc)) continue;

            $words = $this->_split_to_words($doc);
            # note: this means that numbers, etc are counted
            $num_of_words = count($words);
            $doclist[$id] = $num_of_words;

            # filter non-qualifying words: 1-char length, numbers, words
            # that are too long, stop words; count the rest
            $w2 = array();
            foreach ($words as $v) {
                $len = strlen($v);
                $lower_v = strtolower($v);
                if ($len > 1 && $len <= $this->max_word_length &&
                    preg_match("/[a-z]/", $lower_v) &&
                    !isset($this->stoplist[$lower_v])) {
                    if (!isset($w2[$lower_v])) $w2[$lower_v] = 0;
                    $w2[$lower_v]++;
                }
            }

            # record (doc id, relative frequency) for every kept word;
            # $num_of_words is > 0 here because $w2 is non-empty
            foreach ($w2 as $k => $v) {
                $lower_k = strtolower($k);
                if (!isset($wordlist[$lower_k])) $wordlist[$lower_k] = array();
                $wordlist[$lower_k][] = array($id, $v / $num_of_words);
            }
        }

        # submit to database
        ####################

        $linkid = $this->linkid;
        $idx = $this->index_name;

        # lock the tables in case some other process removes a certain word
        # between step 0 and 1 and 2 and 3
        if (!mysql_query("LOCK TABLES {$idx}_doclist WRITE, ".
                         "{$idx}_vectorlist WRITE, ".
                         "{$idx}_wordlist WRITE", $linkid)) {
            $this->ERROR = "Can't lock tables when adding documents: ".
                mysql_error($linkid);
            return;
        }

        # 0
        # add the docs first
        foreach ($doclist as $k => $v) {
            if (!mysql_query("REPLACE INTO {$idx}_doclist (id,n) VALUES (".
                             "'".(addslashes($k))."'".
                             ",".
                             "'".(addslashes($v))."'".
                             ")", $linkid)) {
                $this->ERROR = "Can't add doc id=`$k' to doclist: ".
                    mysql_error($linkid);
                mysql_query("UNLOCK TABLES", $linkid);
                return;
            }
        }

        # 1
        # and then add the words
        foreach ($wordlist as $k => $v) {
            if (!mysql_query("INSERT IGNORE INTO {$idx}_wordlist (word) ".
                             "VALUES (".
                             "'".(addslashes($k))."'".
                             ")", $linkid)) {
                $this->ERROR = "Can't add word `$k' to wordlist: ".
                    mysql_error($linkid);
                mysql_query("UNLOCK TABLES", $linkid);
                return;
            }
        }

        # 2
        # get the resulting word ids
        $word_ids = array();
        if (count($wordlist)) {
            if (!($res = mysql_query("SELECT id,word FROM {$idx}_wordlist ".
                                     "WHERE word IN (".
                                     $this->_quoted_list(array_keys($wordlist)).
                                     ")", $linkid))) {
                $this->ERROR = "Can't get data from wordlist: ".
                    mysql_error($linkid);
                mysql_query('UNLOCK TABLES', $linkid);
                return;
            }
            while ($row = mysql_fetch_row($res)) {
                $word_ids[ $row[1] ] = $row[0];
            }
        }

        # 3
        # now add the vectors
        foreach ($wordlist as $word => $hitlist) {
            foreach ($hitlist as $hit) {
                if (!mysql_query("INSERT INTO {$idx}_vectorlist (wid,did,f)".
                                 "VALUES (".
                                 ("'".addslashes($word_ids[$word]))."',".
                                 ("'".addslashes($hit[0]))."',".
                                 ("'".addslashes($hit[1]))."')", $linkid)) {
                    $this->ERROR = "Can't add to vectorlist: ".
                        mysql_error($linkid);
                    mysql_query('UNLOCK TABLES', $linkid);
                    return;
                }
            }
        }

        # all goes well, return TRUE
        mysql_query('UNLOCK TABLES', $linkid);
        return 1;
    }

    # Remove the documents listed in $doc_ids (array) from the doclist and
    # vectorlist tables. Returns 1 on success; on a database error sets
    # $this->ERROR and returns nothing.
    function remove_document($doc_ids) {
        if (!is_array($doc_ids))
            die("KwIndex: syntax: remove_document(array \$doc_ids)");
        if (!count($doc_ids)) return 1;

        $linkid = $this->linkid;
        $idx = $this->index_name;

        # ids are quoted and escaped, consistent with add_document()
        $id_list = $this->_quoted_list($doc_ids);

        if (!mysql_query("LOCK TABLES {$idx}_doclist WRITE, ".
                         "{$idx}_vectorlist WRITE", $linkid)) {
            $this->ERROR = "Can't lock tables when removing documents: ".
                mysql_error($linkid);
            return;
        }

        if (!mysql_query("DELETE FROM {$idx}_doclist WHERE id IN (".
                         $id_list.")", $linkid)) {
            $this->ERROR = "Can't delete from doclist: ".
                mysql_error($linkid);
            mysql_query('UNLOCK TABLES', $linkid);
            return;
        }

        if (!mysql_query("DELETE FROM {$idx}_vectorlist WHERE did IN (".
                         $id_list.")", $linkid)) {
            $this->ERROR = "Can't delete from vectorlist: ".
                mysql_error($linkid);
            mysql_query('UNLOCK TABLES', $linkid);
            return;
        }

        # all goes well, return TRUE
        mysql_query("UNLOCK TABLES", $linkid);
        return 1;
    }

    # Re-index the documents listed in $doc_ids (array): remove them, then
    # add them again. Returns a true value only if both steps succeed.
    function update_document($doc_ids) {
        if (!is_array($doc_ids))
            die("KwIndex: syntax: update_document(array \$doc_ids)");
        if (!count($doc_ids)) return 1;

        return $this->remove_document($doc_ids) &&
               $this->add_document($doc_ids);
    }

    # Find all words that are contained in at least $k % of all documents.
    # Returns an array of words; on a database error sets $this->ERROR and
    # returns null.
    function &common_word($k = 80) {
        $linkid = $this->linkid;
        $idx = $this->index_name;
        $failed = null;

        # first select the number of documents; document_count() returns
        # nothing (null) when its query fails
        $num = $this->document_count();
        if (!isset($num) || $num == -1) {
            $this->ERROR = "Can't retrieve the number of documents: ".
                mysql_error($linkid);
            return $failed;
        }

        # get the statistics from vectorlist
        if (!($res1 = mysql_query("SELECT wid,COUNT(*)/$num as k FROM {$idx}_vectorlist GROUP BY wid HAVING k>=".
                                  ($k/100), $linkid))) {
            $this->ERROR = "Can't retrieve common words: ".
                mysql_error($linkid);
            return $failed;
        }
        $wids = array();
        while ($row = mysql_fetch_row($res1)) $wids[] = $row[0];

        # convert it to word by consulting the wordlist table
        $words = array();
        if (count($wids)) {
            if (!($res2 = mysql_query("SELECT word FROM {$idx}_wordlist ".
                                      "WHERE id IN (".join(',', $wids).
                                      ")", $linkid))) {
                $this->ERROR = "Can't retrieve common words: ".
                    mysql_error($linkid);
                return $failed;
            }
            while ($row = mysql_fetch_row($res2)) $words[] = $row[0];
        }

        return $words;
    }

    # Find all words that are not contained in any document (vectorlist).
    # XXX not yet written; always returns an empty array.
    function &orphan_word() {
        $words = array();
        return $words;
    }

    # Remove the given words (array) from the wordlist and vectorlist
    # tables. Returns 1 on success (including when none of the words is
    # indexed); on a database error sets $this->ERROR and returns nothing.
    function remove_word($words) {
        if (!is_array($words))
            die("KwIndex: syntax: remove_word(array \$words)");
        # an empty list would otherwise build an invalid "IN ()" query
        if (!count($words)) return 1;

        $linkid = $this->linkid;
        $idx = $this->index_name;

        if (!mysql_query("LOCK TABLES {$idx}_wordlist WRITE, ".
                         "{$idx}_vectorlist WRITE", $linkid)) {
            $this->ERROR = "Can't lock tables when removing words: ".
                mysql_error($linkid);
            return;
        }

        # retrieve word ids
        $lower_words = array();
        foreach ($words as $v) $lower_words[] = strtolower($v);
        if (!($res0 = mysql_query("SELECT id FROM {$idx}_wordlist WHERE ".
                                  "word IN (".
                                  $this->_quoted_list($lower_words).")",
                                  $linkid))) {
            $this->ERROR = "Can't delete from wordlist: ".
                mysql_error($linkid);
            mysql_query('UNLOCK TABLES', $linkid);
            return;
        }
        $word_ids = array();
        while ($row = mysql_fetch_row($res0)) $word_ids[] = $row[0];
        $word_ids = join(',', $word_ids); # we'll make it a string

        if (!strlen($word_ids)) {
            # nothing to delete; release the lock before returning
            mysql_query("UNLOCK TABLES", $linkid);
            return 1;
        }

        # delete from wordlist
        if (!mysql_query("DELETE FROM {$idx}_wordlist WHERE id IN ".
                         "($word_ids)", $linkid)) {
            $this->ERROR = "Can't delete from wordlist: ".
                mysql_error($linkid);
            mysql_query("UNLOCK TABLES", $linkid);
            return;
        }

        # delete from vectorlist
        if (!mysql_query("DELETE FROM {$idx}_vectorlist WHERE wid IN ".
                         "($word_ids)", $linkid)) {
            $this->ERROR = "Can't delete from vectorlist: ".
                mysql_error($linkid);
            mysql_query("UNLOCK TABLES", $linkid);
            return;
        }

        # all goes well; the original never released the lock here
        mysql_query("UNLOCK TABLES", $linkid);
        return 1;
    }

    # Add stop words (array). Note: you must manually delete previously
    # indexed words with remove_word(). Returns 1 on success; on a database
    # error sets $this->ERROR and returns nothing.
    function add_stop_word($words) {
        if (!is_array($words))
            die("KwIndex: syntax: add_stop_word(array words)");
        if (!count($words)) return 1;

        $linkid = $this->linkid;
        $idx = $this->index_name;

        foreach ($words as $v) {
            $lv = strtolower($v);
            if (!mysql_query("REPLACE INTO {$idx}_stoplist (word) VALUES (".
                             ("'".addslashes($lv)."'").
                             ")", $linkid)) {
                $this->ERROR = "Can't add to stoplist: ".
                    mysql_error($linkid);
                return;
            }
            $this->stoplist[ $lv ] = 1;
        }

        return 1;
    }

    # Remove stop words (array) from the stoplist table and from the
    # in-memory stoplist. Returns 1 on success; on a database error sets
    # $this->ERROR and returns nothing.
    function remove_stop_word($words) {
        if (!is_array($words))
            die("KwIndex: syntax: remove_stop_word(array words)");
        if (!count($words)) return 1;

        $linkid = $this->linkid;
        $idx = $this->index_name;

        $lower_words = array();
        foreach ($words as $v) $lower_words[] = strtolower($v);

        if (!mysql_query("DELETE FROM {$idx}_stoplist WHERE word IN (".
                         $this->_quoted_list($lower_words).")", $linkid)) {
            $this->ERROR = "Can't delete from stoplist: ".
                mysql_error($linkid);
            return;
        }

        # the original iterated with each() a second time without reset(),
        # so the in-memory stoplist was never updated
        foreach ($lower_words as $lv) unset($this->stoplist[$lv]);

        return 1;
    }

    # Return true if $word (case-insensitive) is in the loaded stoplist.
    function is_stop_word($word) {
        return isset($this->stoplist[ strtolower($word) ]);
    }

    # Shared implementation of search() and match_count().
    #
    # $is_count: true value => return the number of matching documents;
    #            false value => return an array of matching document ids,
    #            best match first.
    # $args: associative array; 'words' (string or array) is required;
    #        optional 're' (match words with REGEXP instead of LIKE),
    #        'boolean' ('AND' or 'OR', default OR), 'num' and 'start'
    #        (result window, 'start' is 1-based; only used for searches).
    # On a database error sets $this->ERROR and returns null.
    function &_search_or_match_count($is_count, &$args) {
        if (!isset($args["words"]))
            die("KwIndex: search: option 'words' must be defined");

        $linkid = $this->linkid;
        $idx = $this->index_name;

        # by-reference return needs variables, not literals
        $failed = null;
        if ($is_count) {
            $no_match = 0;
        } else {
            $no_match = array();
        }

        # split the words if we are offered a single string/not array
        # (assume it's a phrase)
        if (is_array($args["words"])) {
            $words = $args["words"];
        } else {
            $words = $this->_split_to_words($args["words"]);
        }

        # delete duplicate words, convert them all to lower case
        $hashwords = array();
        foreach ($words as $v) $hashwords[ strtolower($v) ] = 1;
        $words = array_keys($hashwords);

        if (!count($words)) return $no_match;

        # first we retrieve the word ids
        $use_re = !empty($args["re"]);
        $op = $use_re ? 'REGEXP' : 'LIKE';
        $bool = (isset($args['boolean']) && $args['boolean'] &&
                 strtoupper($args['boolean']) == 'AND') ? 'AND' : 'OR';

        $op_phrases = array();
        foreach ($words as $v)
            $op_phrases[] = "word $op '".addslashes($v)."'";

        if (!($res0 = mysql_query("SELECT id FROM {$idx}_wordlist WHERE ".
                                  join(' OR ', $op_phrases), $linkid))) {
            $this->ERROR = "Can't retrieve word ids: ".mysql_error($linkid);
            return $failed;
        }
        $word_ids = array();
        while ($row = mysql_fetch_row($res0)) $word_ids[] = $row[0];

        if (!count($word_ids) ||
            ($bool == 'AND' && count($word_ids) < count($words)))
            return $no_match;

        # and then we search the vectorlist
        $can_optimize = 0;
        $stmt = '';
        $id_list = join(',', $word_ids);
        $require_all = ($bool == 'AND' && !$use_re);

        if ($is_count) {
            if ($require_all) {
                $stmt = 'SELECT did,count(wid) as c '.
                    "FROM {$idx}_vectorlist WHERE wid IN (".
                    $id_list.
                    ") ".
                    "GROUP BY did ".
                    "HAVING c >= ". count($word_ids);
            } else {
                $can_optimize = 1;
                $stmt = "SELECT COUNT(DISTINCT did) ".
                    "FROM {$idx}_vectorlist WHERE wid IN (".
                    $id_list.
                    ")";
            }
        } else { // ! $is_count
            # 'num'/'start' are cast to int so they cannot inject SQL
            $limit = '';
            if (isset($args['num'])) {
                $limit = "LIMIT ";
                if (isset($args['start'])) {
                    $limit .= max(0, (int)$args['start'] - 1).",";
                }
                $limit .= (int)$args['num'];
            }
            $stmt = "SELECT did, count(wid) as c, avg(f) as a, ".
                "count(wid)*count(wid)*count(wid)*avg(f) as ca ".
                "FROM {$idx}_vectorlist WHERE wid IN (".
                $id_list.
                ") ".
                "GROUP BY did ".
                ($require_all ? "HAVING c >= ". count($word_ids) : '').
                " ORDER BY ca DESC ".
                $limit;
        }

        _debug("search SQL: ", $stmt);
        if (!($res = mysql_query($stmt, $linkid))) {
            $this->ERROR = "Can't search vectorlist: ".mysql_error($linkid);
            return $failed;
        }

        if ($is_count) {
            if ($can_optimize) {
                $row = mysql_fetch_row($res);
                $total = $row[0];
            } else {
                $total = mysql_num_rows($res);
            }
            return $total;
        }

        // ! $is_count
        $doc_ids = array();
        while ($row = mysql_fetch_row($res)) $doc_ids[] = $row[0];
        return $doc_ids;
    }

    # Return an array of ids of the documents matching $args (see
    # _search_or_match_count() for the accepted options).
    function &search($args) {
        $result =& $this->_search_or_match_count(0, $args);
        return $result;
    }

    # Return the number of documents matching $args (see
    # _search_or_match_count() for the accepted options).
    function &match_count($args) {
        $result =& $this->_search_or_match_count(1, $args);
        return $result;
    }

    # Drop all four index tables. Returns 1 on success; on a database
    # error sets $this->ERROR and returns nothing.
    function remove_index() {
        $linkid = $this->linkid;

        foreach ($this->_index_tables() as $table) {
            if (!mysql_query("DROP TABLE IF EXISTS $table", $linkid)) {
                $this->ERROR = "Can't remove table $table: ".
                    mysql_error($linkid);
                return;
            }
        }

        return 1;
    }

    # Drop and re-create the index tables.
    function empty_index() {
        return $this->remove_index() && $this->_create_index();
    }

    # Number of documents in the collection (nothing on query failure).
    function document_count() {
        $linkid = $this->linkid;
        $idx = $this->index_name;

        if (!($res = mysql_query("SELECT COUNT(*) FROM {$idx}_doclist",
                                 $linkid))) return;
        $row = mysql_fetch_row($res);
        return $row[0];
    }

    # Number of unique words (nothing on query failure).
    function word_count() {
        $linkid = $this->linkid;
        $idx = $this->index_name;

        if (!($res = mysql_query("SELECT COUNT(*) FROM {$idx}_wordlist",
                                 $linkid))) return;
        $row = mysql_fetch_row($res);
        return $row[0];
    }

    # PRIVATE METHODS
    #################

    # Split a string into an array of words. A word is a run of word
    # characters that may contain apostrophes in the middle (e.g. "it's").
    function &_split_to_words($str) {
        preg_match_all("/\\b(\\w[\\w']*\\w+|\\w+)\\b/", $str, $matches);
        return $matches[1];
    }

    # Quote and escape every value of $values and join them with commas,
    # ready to be placed inside an SQL "IN (...)" list.
    function _quoted_list($values) {
        $quoted = array();
        foreach ($values as $value)
            $quoted[] = "'".addslashes($value)."'";
        return join(',', $quoted);
    }

    # Drop any previous index tables and create the four tables afresh.
    # Returns 1 on success, nothing if the old tables cannot be dropped;
    # dies if a table cannot be created.
    function _create_index() {
        $linkid = $this->linkid;
        $idx = $this->index_name;
        $stmt = '';

        # drop previous tables, if they exist
        if (!$this->remove_index()) return;

        # create doclist table
        $stmt = "CREATE TABLE {$idx}_doclist " .
            '(id ' . $this->_int_column_type($this->doclist_cardinality).
            ' AUTO_INCREMENT PRIMARY KEY, ' .
            ' n ' . $this->_int_column_type($this->wordlist_cardinality).
            ' NOT NULL'.
            ')';
        if (!mysql_query($stmt, $linkid))
            die("KwIndex: Can't create table {$idx}_doclist: ".
                mysql_error($linkid));

        # create wordlist table
        $stmt = "CREATE TABLE {$idx}_wordlist " .
            '(id ' . $this->_int_column_type($this->wordlist_cardinality) .
            ' AUTO_INCREMENT PRIMARY KEY, ' .
            " word VARCHAR({$this->max_word_length})" .
            ' BINARY NOT NULL, ' .
            "UNIQUE (word) " .
            ')';
        if (!mysql_query($stmt, $linkid))
            die("KwIndex: Can't create table {$idx}_wordlist: ".
                mysql_error($linkid));

        # create stoplist table
        $stmt = "CREATE TABLE {$idx}_stoplist " .
            '(id ' . $this->_int_column_type($this->wordlist_cardinality).
            ' AUTO_INCREMENT PRIMARY KEY, ' .
            " word VARCHAR({$this->max_word_length})" .
            ' BINARY NOT NULL, ' .
            "UNIQUE (word) " .
            ')';
        if (!mysql_query($stmt, $linkid))
            die("KwIndex: Can't create table {$idx}_stoplist: ".
                mysql_error($linkid));

        # create vectorlist table
        $stmt = "CREATE TABLE {$idx}_vectorlist " .
            '(wid '. $this->_int_column_type($this->wordlist_cardinality).
            ' NOT NULL, ' .
            'did '. $this->_int_column_type($this->doclist_cardinality).
            ' NOT NULL, '.
            'UNIQUE (wid,did), '.
            'f FLOAT(10,4) NOT NULL'.
            ')';
        if (!mysql_query($stmt, $linkid)) {
            die("KwIndex: Can't create table {$idx}_vectorlist: ".
                mysql_error($linkid));
        }

        $this->stoplist = array();
        return 1;
    }

    # Pick the smallest unsigned MySQL integer column type used for a
    # table expected to hold $cardinality rows.
    function _int_column_type($cardinality) {
        if ($cardinality >= 16*1024*1024) return 'INT UNSIGNED';
        if ($cardinality >= 64*1024)      return 'MEDIUMINT UNSIGNED';
        if ($cardinality >= 256)          return 'SMALLINT UNSIGNED';
        return 'TINYINT UNSIGNED';
    }

    # Names of the four tables that make up this index.
    function &_index_tables() {
        $idx = $this->index_name;
        $tables = array(
            "{$idx}_doclist", "{$idx}_wordlist",
            "{$idx}_vectorlist", "{$idx}_stoplist");
        return $tables;
    }

    # Return 1 if all four index tables exist in the database, 0 otherwise.
    # Dies if the table list cannot be retrieved.
    function _index_exists() {
        $linkid = $this->linkid;

        if (!($res = mysql_list_tables($this->db_name, $linkid)))
            die("Can't list table: ".mysql_error($linkid));

        $existing_tables = array();
        $total = mysql_num_rows($res);
        for ($i = 0; $i < $total; $i++) {
            $existing_tables[ mysql_tablename($res, $i) ] = 1;
        }

        foreach ($this->_index_tables() as $table) {
            if (!isset($existing_tables[$table])) return 0;
        }
        return 1;
    }

} // class KwIndex
?>
相关文章
读者评论
发表评论
|
|