#| Generic dictionary routines. Author: Dmitri Hrapof Version: 0.4 Copyright (C) 2004, 2005 Dmitri Hrapof This file is part of Geiriadur. Geiriadur is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. Geiriadur is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Geiriadur; see the file COPYING. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |# (in-package geiriadur) (defparameter +welsh-mutation-table+ '(("b" "p") ("c" ((nil nil) ("h" "c"))) ("d" ((nil "t") ("d" "d"))) ("f" "b" "m") ("g" "c") ("h" "") ("l" ((nil "ll" "gl") ("l" nil))) ("m" ((nil "b") ("h" "p"))) ("n" ((nil "d") ("g" ((nil "g") ("h" "c"))) ("h" "t"))) ("p" ((nil nil) ("h" "p"))) ("r" ((nil "rh" "gr") ("h" nil))) ("s" nil) ;to avoid searching for gsaith, etc. ("t" ((nil nil) ("h" "t"))) (nil "g"))) (defparameter +irish-mutation-table+ '(("b" ((nil nil) ("h" ((nil "b") ("f" "f"))) ("p" "p"))) ("c" ((nil nil) ("h" "c"))) ("d" ((nil nil) ("h" "d") ("t" "t"))) ("f" ((nil nil) ("h" "f"))) ("g" ((nil nil) ("c" "c") ("h" "g"))) ("h" "") ("l" nil) ("m" ((nil nil) ("b" "b") ("h" "m"))) ("n" ((nil nil) ("-" "") ("d" "d") ("g" "g"))) ("p" ((nil nil) ("h" "p"))) ("r" nil) ("s" ((nil nil) ("h" "s"))) ("t" ((nil "") ("h" "t") ("r" nil))) (nil nil))) (defparameter +breton-mutation-table+ '(("b" "p") ("c'h" "g" "k") ("d" "t") ("f" "p") ("g" "k") ("k" "g") ("p" "b") ("t" "d") ("v" "b" "m") ("w" "gw") ("z" "d" "t"))) (defparameter +mutations+ `((1 . ,+welsh-mutation-table+) (3 . ,+irish-mutation-table+) (4 . ,+breton-mutation-table+))) (defvar *p-count* nil) (defvar *s-count* nil) (defvar *p-by-1st* nil) (defvar *p-by-2nd* nil) (defvar *s-by-id* (make-hash-table)) (defvar *f-by-word* (make-hash-table)) (defvar *x-by-tran* (make-hash-table)) (defvar *languages* (make-hash-table)) (defvar *date* nil) (defparameter *conn* nil) (defparameter *gone* nil) (defparameter *r5k* 0.0) (defparameter *r30k* 0.0) (defparameter *c5k* 0.0) (defparameter *c30k* 0.0) (defstruct lang id name mutable maxelen (wcount -1) (sree (make-instance 'coeden:burkhard-keller-tree)) (free (make-instance 'coeden:burkhard-keller-tree)) (sbyv (make-hash-table :test #'equal)) (fbyv (make-hash-table :test #'equal)) (ebyv (make-hash-table :test #'equal)) (kbyv (make-hash-table :test #'equal)) (ebyp (make-hash-table)) (kbyw (make-hash-table))) (declaim (inline ins del sub)) (defun ins (x i) (declare (ignore x i)) 1) (defun del (x i) (declare (ignore x i)) 1) (defun sub (x i y j) (if (char= (schar x i) (schar y j)) 0 1)) (defun levenshtein-metric (x y) (let ((m (length x)) (n (length y))) (declare (type fixnum m n)) (let ((tt (make-array `(,(1+ m) ,(1+ n)) :element-type 'fixnum))) (setf (aref tt 0 0) 0) (dotimes (j n) (setf (aref tt 0 (1+ j)) (the fixnum (+ (ins y j) (aref tt 0 j))))) (dotimes (i m) (let ((i1 (1+ i))) (setf (aref tt i1 0) (the fixnum (+ (del x i) (aref tt i 0)))) (dotimes (j n) (let ((j1 (1+ j))) (setf (aref tt i1 j1) (the fixnum (min (the fixnum (+ (aref tt i j) (sub x i y j))) (the fixnum (+ (del x i) (aref tt i j1))) (the fixnum (+ (ins y j) (aref tt i1 j)))))))))) (aref tt m n)))) (defun list-languages () (let (ri) (maphash #'(lambda (i l) (push (cons i (lang-name l)) ri)) *languages*) (sort ri #'< :key #'car))) (defun make-stats (st) (format st " Статистика словаря

Статистика словаря на ~2,,,'0@a/~2,,,'0@a/~a

Число слов

~%" (nth 3 *date*) (nth 4 *date*) (nth 5 *date*)) (let ((llist (list-languages))) (dolist (lng llist) (format st "~%" (cdr lng) (lang-wcount (gethash (car lng) *languages*)))) (format st "
~a~a

Число слов с переводами

") (dolist (lng llist) (format st "" (cdr lng))) (format st "~%") (dolist (lng llist) (format st "" (cdr lng)) (dolist (gnl llist) (format st "" (aref *s-count* (car lng) (car gnl)))) (format st "~%")) (format st "
~a
~a~a
~%") ; (format st "

Число прямых переводов

; ;") ; (dolist (lng llist) (format st "" (cdr lng))) ; (format st "~%") ; (dolist (lng llist) ; (format st "" (cdr lng)) ; (dolist (gnl llist) ; (format st "" (aref *p-count* (car lng) (car gnl)))) ; (format st "~%"))) ; (format st "
~a
~a~a
")) (format st "

Процент покрытия наиболее частотных слов

500030000
Русский → Cymraeg~,2f% ~,2f%
Cymraeg → Русский~,2f% ~,2f%
" (/ *r5k* 50.0) (/ *r30k* 300.0) (/ *c5k* 50.0) (/ *c30k* 300.0)) (format st "COA"))) (defun unm-i (word rule) (let ((ord (if (car rule) (subseq word (length (car rule))) word))) (remove-if #'not (mapcar #'(lambda (b) (if b (concatenate 'string b ord))) (cdr rule))))) (defun unmutate (word table) (if table (let ((b (find-if #'(lambda (k) (and k (eql (search k word) 0))) table :key #'car))) (if b (let ((pt (second b))) (if (and pt (listp pt)) (unmutate (subseq word (length (car b))) pt) (unm-i word b))) (unm-i word (find-if-not #'car table)))))) (defun find-by-word (word from) (gethash word (lang-sbyv from))) (defun find-by-form (word from) (mapcar #'(lambda (w) (append (gethash (second w) *s-by-id*) (list (cons 0 w)))) ; (gethash word (lang-fbyv from)))) (pg:pg-result (pg:pg-exec *conn* (format nil (if (and (zerop (lang-id from)) (position #\е word)) "select f.* from f, s where f.word=s.id and s.lang=~a and f.val~~~/gdr:jo/" "select f.* from f, s where f.word=s.id and s.lang=~a and f.val=~/gdr:stroqa/") (lang-id from) word)) :tuples))) (defun blow-up (word maxl) (let (res (wlen (length word))) (dotimes (i (min wlen (+ maxl 1)) res) (let ((rlen (- wlen i))) (push (cons (subseq word 0 rlen) (subseq word rlen)) res))))) (defun walker (env exp) (if (atom exp) (cond ((member exp '(or and not t)) exp) ((member exp env) t) (t nil)) (mapcar #'(lambda (e) (walker env e)) exp))) #+clisp (defun scheme (exp env) (handler-case (handler-bind ((unbound-variable #'(lambda (c) (declare (ignore c)) (use-value nil)))) (eval `(let ,(mapcar #'(lambda (s) `(,s t)) env) ,exp))) (condition () nil))) #-clisp (defun scheme (exp env) (ignore-errors (eval (walker env exp)))) (defun find-by-flexion (word from) (let (res) (dolist (dbl (blow-up word (lang-maxelen from)) res) (dolist (otup (gethash (cdr dbl) (lang-ebyv from))) (dolist (ktup (gethash (car dbl) (lang-kbyv from))) (let ((s (gethash (second ktup) *s-by-id*))) (if (and (= (third otup) (third s)) (scheme (fourth ktup) (fifth otup))) (push (append s (list `(,(first otup) ,(second otup) ,(third otup) ,word ,(seventh otup) ,(sixth otup)))) res)))))))) (defun guess-by-word (word from) (mapcar #'(lambda (p) (coeden:tree-fruit (second p))) (coeden:gather (lang-sree from) word #'levenshtein-metric 1 :key #'fourth))) (defun guess-by-form (word from) (mapcar #'(lambda (p) (let ((f (coeden:tree-fruit (second p)))) (append (gethash (second f) *s-by-id*) (list (cons 0 f))))) ; (coeden:gather (lang-free from) word #'levenshtein-metric 1 :key #'third))) nil)) (defun conventional-find (w f) (apply #'append (mapcar #'(lambda (w) (append (find-by-word w f) (find-by-form w f) (find-by-flexion w f))) (cons w (unmutate w (lang-mutable f)))))) (defun unconventional-find (w f) (apply #'append (mapcar #'(lambda (w) (append (guess-by-word w f) (guess-by-form w f))) (cons w (unmutate w (lang-mutable f)))))) (defun dry-n-swansick (word from) "поиск приставок потом" (apply #'append (mapcar #'(lambda (w) (or (conventional-find w from) (and (string/= w (string-downcase w)) (conventional-find (string-downcase w) from)) (unconventional-find w from))) (remove 0 (ppcre:split "[ ,.?!;:]+" word) :key #'length)))) (defun dod-o-hyd (word from) (let ((found (make-hash-table)) (tails (make-hash-table))) (dolist (w (dry-n-swansick word (gethash from *languages*))) (if (null (gethash (car w) found)) (setf (gethash (car w) found) (subseq w 0 6))) (if (seventh w) (push (seventh w) (gethash (car w) tails)))) (loop for w being the hash-values of found collect (append w (list (gethash (car w) tails)))))) (defgeneric select-examples (conn tran)) (defmethod select-examples (conn tran) (declare (ignore conn)) (gethash tran *x-by-tran*)) (defgeneric select-translations (conn word from to fwrd)) (defmethod select-translations (conn word from to fwrd) (declare (ignore conn)) (let ((ptable (if fwrd (aref *p-by-1st* from to) (aref *p-by-2nd* from to)))) (apply #'append (mapcar #'(lambda (w) (let ((trans (gethash (car w) ptable))) (if trans (append (mapcar #'(lambda (f) `(0 ,@(subseq w 0 6) ,@f nil 0)) (seventh w)) (mapcar #'(lambda (c) `(,(first c) ,@(if (= (car w) (cadr c)) (subseq w 0 6) (gethash (second c) *s-by-id*)) ,@(if (= (car w) (third c)) (subseq w 0 6) (gethash (third c) *s-by-id*)) ,(fourth c) ,(fifth c))) trans))))) (dod-o-hyd word from))))) (defgeneric find-translations (conn word to)) (defmethod find-translations (conn word to) (declare (ignore conn)) (let ((w (gethash word *s-by-id*))) (mapcar #'(lambda (c) `(,(first c) ,@w ,@(gethash (third c) *s-by-id*) ,(fourth c) ,(fifth c))) (gethash (car w) (aref *p-by-1st* (second w) to))))) (defgeneric cross-translations (conn word from to via)) (defmethod cross-translations (conn word from to via) (let ((ptable (aref *p-by-1st* via to)) (porevo (select-translations conn word from via t))) (append (remove-if-not #'zerop porevo :key #'car) (remove-duplicates (reverse (apply #'append (mapcar #'(lambda (c) (mapcar #'(lambda (cc) `(1 ,@(subseq c 1 7) ,@(gethash (third cc) *s-by-id*) ,(format nil "~@[~a|~]{~a}" (fourth cc) (nth 10 c)) 0)) (gethash (nth 7 c) ptable))) (remove 0 porevo :key #'car)))) :key #'eighth)))) (defgeneric find-forms (conn word)) (defun generate-forms (word) (let ((lng (gethash (second word) *languages*)) (res ())) (let ((end (gethash (third word) (lang-ebyp lng)))) (dolist (k (gethash (first word) (lang-kbyw lng)) res) (dolist (e end) (if (scheme (fourth k) (fifth e)) (push `(0 ,(second word) ,(third word) ,(format nil "~a~a" (third k) (fourth e)) ,(seventh e) "") res))))))) (defmethod find-forms (conn word) (declare (ignore conn)) (let ((w (gethash word *s-by-id*))) `(,w ,@(mapcar #'(lambda (f) `(,(first f) ,(second w) ,(third w) ,@(cddr f))) (gethash word *f-by-word*)) ,@(generate-forms w)))) (defgeneric find-words (conn word from)) (defmethod find-words (conn word from) (declare (ignore conn)) (find-by-word word (gethash from *languages*))) (defun hier (lst tr) (if lst (if (= (caaar lst) (cadr tr)) (progn (push tr (cadar lst)) lst) (push `(,(cdr tr) (,tr)) lst)) `((,(cdr tr) (,tr))))) (defun tt< (p1 p2) (if (= (second p1) (second p2)) (cond ((and (zerop (car p1)) (zerop (car p2))) t) ((and (zerop (car p1)) (not (zerop (car p2)))) nil) ((and (zerop (car p2)) (not (zerop (car p1)))) t) (t (> (nth 14 p1) (nth 14 p2)))) (< (second p1) (second p2)))) (defun sort-translations (tt) (reduce #'hier (sort tt #'tt<) :initial-value nil)) (defun acquire-translations (word from &optional (to 0) &key reverse via) (let ((c (if *gone* *conn*))) (sort-translations (cond ((not (stringp word)) (find-translations c word from)) ;ugly hack (via (if (and (= via 5) (= from 4) (= to 0)) (llydaweg (if *gone* *conn*) word) (cross-translations c word from to via))) (t (select-translations c word from to (not reverse))))))) (defun brezhoneg (w &optional (part 0) via) (let ((drakma:*drakma-default-external-format* :utf-8)) (remove-duplicates (apply #'append (mapcar #'(lambda (fr) ;(format t "~a~%" fr) (apply #'append (mapcar #'(lambda (n) (mapcar #'(lambda (ru) (let ((r (string-trim " " (ppcre:regex-replace "\\(.*$" (ppcre:regex-replace "\\[[^]]*\\]" (ppcre:regex-replace "ё" ru "е") "") "")))) (if via (list r fr) r))) (remove 0 (ppcre:split "[;,] *" (string-trim "() " (stp:data n))) :key #'length))) (let ((yandex (chtml:parse (drakma:http-request (format nil "http://lingvo.yandex.ru/~a/%D1%81%20%D1%84%D1%80%D0%B0%D0%BD%D1%86%D1%83%D0%B7%D1%81%D0%BA%D0%BE%D0%B3%D0%BE/" (drakma::url-encode fr :utf-8))) (stp:make-builder)))) (xpath:with-namespaces (("html" "http://www.w3.org/1999/xhtml")) (or (xpath:all-nodes (xpath:evaluate "//html:li[@id]/text() | //html:li[@id]/html:span/text() | //html:li[@id]/html:a/text()" yandex)) (xpath:all-nodes (xpath:evaluate "//html:div[@class='b-translate__value']/html:i/text()" yandex)))))))) (let ((b (let* ((drakma:*drakma-default-external-format* :iso-8859-1) (ns (xpath:all-nodes (xpath:with-namespaces (("html" "http://www.w3.org/1999/xhtml")) (xpath:evaluate "//html:p[contains(.,'][')]" (chtml:parse (drakma:http-request "http://www.agencebretagnepresse.com/cgi-bin/dico.cgi" :parameters `(("dico" . "breton") ("key" . ,(substitute #\n #\ñ w)))) (stp:make-builder))))))) (if (> 2 (length ns)) (car ns) (second (car (remove-if-not #'(lambda (p) (= part (first p))) (mapcar #'(lambda (n) (let ((][ (or (ignore-errors (stp:data (stp:nth-child 1 (stp:first-child n)))) ""))) (format t "~a~%" ][) (list (cond ((search "-où" ][) 1) ((search "-ioù" ][) 1) ((search " ad." ][) 2) ((search "vb." ][) 0) ((search " g." ][) 1) ((search " b." ][) 1) (t 0)) n))) ns)))))))) (when b (remove 0 (apply #'append (mapcar #'(lambda (bbb) (mapcar #'(lambda (r) (string-trim " )" (ppcre:regex-replace "\\([^)]*(\\)|$)" r ""))) (ppcre:split ", *" (stp:data (stp:first-child bbb))))) (cdr (stp:filter-recursively #'(lambda (c) (when (and (eq 'stp:element (type-of c)) (string= "b" (stp:local-name c))) (let ((p (ignore-errors (stp:previous-sibling c)))) (or (null p) (not (eq 'stp:text (type-of p))) (zerop (length (stp:data p))) (char/= #\- (char (stp:data p) (1- (length (stp:data p))))))))) b)))) :key #'length))))) :test #'string= :key (if via #'first #'identity) :from-end t))) ;1, s.id, s.lang, s.part, s.val, s.attr, s.pron, ss.id, ss.lang, ss.part, ss.val, ss.attr, ss.pron, coalesce(p.comment,'')||'|{'||sss.val||'}', 0 (defun llydaweg (conn word) (nreverse (apply #'append (mapcar #'(lambda (br) (mapcar #'(lambda (ru) (let ((rus (car (find-words conn (car ru) 0)))) (if rus `(1 ,@br ,@rus ,(format nil "{~a}" (cadr ru)) 0) `(1 ,@br 0 0 0 nil nil nil ,(format nil "~a|{~a}" (car ru) (cadr ru)) 0)))) (ignore-errors (brezhoneg (fourth br) (third br) t)))) (if conn (find-words conn word 4) (mapcar #'(lambda (w) (subseq w 0 6)) (dod-o-hyd word 4))))))) (defun acquire-words (word from) (find-words (if *gone* *conn*) word from)) (defun acquire-forms (word) (find-forms *conn* word)) (defun acquire-examples (tran) (select-examples (if *gone* *conn*) tran)) (defun dump (f) (with-open-file (fd f :direction :output :if-exists :supersede :if-does-not-exist :create) (format fd "~s ~s ~s ~s ~s ~s ~s ~s" *p-count* *p-by-1st* *p-by-2nd* *s-by-id* *f-by-word* *x-by-tran* *languages* *date*))) (defun slurp (f) (with-open-file (fd f :direction :input) (setf *p-count* (read fd)) (setf *p-by-1st* (read fd)) (setf *p-by-2nd* (read fd)) (setf *s-by-id* (read fd)) (setf *f-by-word* (read fd)) (setf *x-by-tran* (read fd)) (setf *languages* (read fd)) (setf *date* (read fd))))