2012/02/19

Common Lisp で全文検索 Web システム

仕事で使えないかなと思って、 Common Lisp で全文検索 Web システムを作ってみたが、あまり速くなかった。

MontezumaMeCab のトークナイザを付けて、 IOLib 使ったイベント方式の Web サーバでインターフェースを提供する実装。

Montezuma をファイルベースのインデックにすると厳しい。インメモリのインデックスならチューニングすればいけるかも。

(eval-when (:compile-toplevel :load-toplevel :execute)
(ql:quickload :montezuma)
(ql:quickload :clsql)
(ql:quickload :info.read-eval-print.series-ext)
(ql:quickload :info.read-eval-print.mecab)
(ql:quickload :info.read-eval-print.web-server))

(info.read-eval-print.series-ext:sdefpackage
:fts (:use :cl :info.read-eval-print.mecab
:info.read-eval-print.web-server))

(in-package :fts)


;;; tokenizer
(defclass mecab-tokenizer (montezuma::tokenizer)
((tokens :initform :unbound)
(position :initform 0)))

(defmethod montezuma::next-token :before ((self mecab-tokenizer))
(with-slots (montezuma::input tokens) self
(when (eq tokens :unbound)
(setf tokens
(typecase montezuma::input
(stream
(mecab (collect 'string (scan-stream montezuma::input #'read-char))))
(t
(mecab montezuma::input)))))))

(defmethod montezuma::next-token ((self mecab-tokenizer))
(with-slots (tokens position) self
(let ((token (pop tokens)))
(if token
(let* ((表層形 (node-表層形 token))
(new-position (+ position (length 表層形)))
(token (or (node-原形 token) 表層形)))
(prog1 (montezuma::make-token
(montezuma::normalize self token)
position new-position)
(setf position new-position)))
nil))))

(defmethod montezuma::normalize ((self mecab-tokenizer) str)
str)

(defun search-full-text (query)
(collect
(montezuma:document-value
(montezuma:get-document
*index*
(montezuma:doc
(scan (montezuma::score-docs
(montezuma:search *index* (format nil "text:~a" query)
:num-docs 5000)))))
"id")))
;;(search-full-text "京都")

(defun /search ()
(let ((q (params :q)))
(info.read-eval-print.web-server::make-response
(if q
(search-full-text q)
""))))

(defun search-by-id (id)
(montezuma:search-each *index* (format nil "id:~a" id)
(lambda (doc score)
(declare (ignore score))
(print doc)
(return-from search-by-id doc)))
nil)

(defun add-index (id text)
(let ((doc-num (search-by-id id)))
(if doc-num
(montezuma:update *index* doc-num `(("text" . ,text)))
(montezuma:add-document-to-index
*index*
`(("id" . ,id)
("text" . ,text))))
(info.read-eval-print.web-server::make-response "ok")))

(defun /add ()
(let ((id (params :id))
(text (params :text)))
(add-index id text)
(info.read-eval-print.web-server::make-response "ok")))

(defun run ()
(sb-thread:make-thread (lambda ()
(let ((*handler-package* :fts))
(start)))
:name "full text search"))


;;; analyzer
(defclass mecab-analyzer (montezuma::standard-analyzer)
())

(defmethod montezuma::token-stream ((self mecab-analyzer) field string-or-stream)
(declare (ignore field))
(reduce (lambda (acc x) (make-instance x :input acc))
'(mecab-tokenizer
montezuma::lowercase-filter
montezuma::stop-filter)
:initial-value string-or-stream))



;;; index 作成
(defparameter *index*
(make-instance 'montezuma:index
:path "/tmp/montezuma" ; 指定しなければインメモリになる
:analyzer (make-instance 'mecab-analyzer)))


(defun make-initial-index ()
(progn
(clsql-sys:connect '("localhost" "outing_development" "root" "")
:database-type :mysql)
(clsql-sys:execute-command "set names utf8"))
(time
(clsql:loop for (id name kana description address search-keyword prefecture-id)
being the tuples of
"select id name, kana, description, address, search_keyword, prefecture_id
from facilities"

for text = (delete #\return (format nil "~@{~a~^ ~}" name kana description address search-keyword))
do (print id)
do (add-index id text))))


#|
http://localhost:8888/add?id=1&text=京都に行こう。
http://localhost:8888/add?id=2&text=東京都に行こう。
http://localhost:8888/add?id=3&text=京都府に行こう。
http://localhost:8888/search?q=京都

(time (labels ((f (q)
(cons q (search-full-text q))))
(dotimes (i 100)
(collect (f (scan '("京都" "東都" "京" "都" "子供")))))))

|#

0 件のコメント: