Common Lisp で実装された全文検索エンジン Montezuma
Common Lisp で実装された全文検索エンジン Montezuma を動かす。
MeCab を使った analyzer を実装して、日本語を扱えるところまで確認した。
タイトルやボディ等の構造を持った文書も扱える。検索は and, or, not, 部分一致等結構柔軟。ただし、範囲検索(2010-4-26〜2010-9-26のような)はできない。しっかりちゃんと動く。十分実務に使えると思った。
(eval-when (:compile-toplevel :load-toplevel :execute)
(require :montezuma))
(defpackage :try-montezuma
(:use :cl))
(in-package :try-montezuma)
;;; MeCab
(defun run-mecab (stream)
(with-output-to-string (out)
(sb-ext:run-program "mecab" nil :search t :input stream :output out)))
(defgeneric mecabaku (input)
(:method ((stream stream))
(with-input-from-string (in (run-mecab stream))
(loop for i = (read-line in)
until (string= "EOS" i) collect (collect-mecabu-output i))))
(:method ((string string))
(with-input-from-string (stream string)
(mecabaku stream))))
(defmacro collect-mecabu-output-macro ()
(let ((x '(表層形 品詞 品詞細分類1 品詞細分類2 品詞細分類3 活用形 活用型 原形 読み 発音)))
`(defun collect-mecabu-output (one-line)
(ppcre:register-groups-bind ,x
("(.*)\\t([^,]+),([^,]+),([^,]+),([^,]+),([^,]+),([^,]+)(?:,([^,]+),([^,]+),([^,]+))?"
one-line)
(list ,@(loop for i in x
append `(',i ,i)))))))
(collect-mecabu-output-macro)
;;(mecabaku "桜が咲いた。Hello")
;;(loop for i in (mecabaku "桜が咲いた。") collect (getf i '原形))
;; => ("桜" "が" "咲く" "た" "。")
;;; tokenizer
(defclass mecab-tokenizer (montezuma::tokenizer)
((tokens :initform :unbound)
(position :initform 0)))
(defmethod montezuma::next-token :before ((self mecab-tokenizer))
(with-slots (montezuma::input tokens) self
(when (eq tokens :unbound)
(setf tokens (mecabaku montezuma::input)))))
(defmethod montezuma::next-token ((self mecab-tokenizer))
(with-slots (tokens position) self
(let ((token (pop tokens)))
(if token
(let* ((表層形 (getf token '表層形))
(new-position (+ position (length 表層形)))
(token (or (getf token '原形) 表層形)))
(prog1 (montezuma::make-token
(montezuma::normalize self token)
position new-position)
(setf position new-position)))
nil))))
(defmethod montezuma::normalize ((self mecab-tokenizer) str)
str)
;;; analyzer
(defclass mecab-analyzer (montezuma::standard-analyzer)
())
(defmethod montezuma::token-stream ((self mecab-analyzer) field string-or-stream)
(declare (ignore field))
(reduce (lambda (acc x) (make-instance x :input acc))
'(mecab-tokenizer
montezuma::lowercase-filter
montezuma::stop-filter)
:initial-value string-or-stream))
;;; index 作成
(defparameter *index*
(make-instance 'montezuma:index
:path "/tmp/montezuma"
:analyzer (make-instance 'mecab-analyzer)))
;;; 使ってみる。
(loop for i in '("貘の鼻は長い。" "亀の甲羅は固い。" "This is a new document to be indexed"
(("title" . "Programming Lisp")
("content" . "car cdr cons loop is"))
(("title" . "Programming Forth")
("content" . "dup swap is")))
do (montezuma:add-document-to-index *index* i))
(loop for i in '("貘" "の" "甲羅" "長い" "貘の鼻" "貘 鼻 長さ"
"this" "to be indexed" "is"
"title:Lisp" "Lisp")
collect (montezuma:search-each
*index* i (lambda (doc score)
(print (list i doc score)))))
;; + をつけると must
(loop for i in '("title:Lisp"
"title:Lisp content:xaxa" ; or
"title:Lisp +content:car" ; and
"title:Lisp +content:xaxa" ; and
"title:\"Programming Forth\"" ; フレーズ
)
for acc = nil
collect (progn
(montezuma:search-each
*index* i
(lambda (doc score)
(push (montezuma:document-value (montezuma:get-document *index* doc)
"content")
acc)))
acc))
0 件のコメント:
コメントを投稿