diff --git a/config/mime.json b/config/mime.json
index 2bf1be05..27a062fc 100644
--- a/config/mime.json
+++ b/config/mime.json
@@ -67,6 +67,7 @@
     "m3u8": "application/vnd.apple.mpegurl",
     "m4a": "audio/x-m4a",
     "m4v": "video/x-m4v",
+    "md": "text/markdown",
     "mdc": "image/x-minolta-mdc",
     "mef": "image/x-mamiya-mef",
     "mid": "audio/midi",
diff --git a/server/model/formater/office.go b/server/model/formater/office.go
new file mode 100644
index 00000000..be6f6a2f
--- /dev/null
+++ b/server/model/formater/office.go
@@ -0,0 +1,108 @@
+package formater
+
+import (
+	"archive/zip"
+	"bytes"
+	"encoding/xml"
+	. "github.com/mickael-kerjean/filestash/server/common"
+	"io"
+	"io/ioutil"
+	"os"
+	"regexp"
+	"strings"
+)
+
+// Both patterns are compiled once here instead of once per <t> element.
+var (
+	// officeLoneDot matches a full stop with whitespace on both sides.
+	officeLoneDot = regexp.MustCompile("\\s+\\.\\s+")
+	// officeSpaces matches a run of two or more whitespace characters.
+	officeSpaces = regexp.MustCompile("\\s{2,}")
+)
+
+// OfficeFormater extracts the text of an office document read from r.
+//
+// The stream is spooled to a temporary file and opened as a zip archive. The
+// content of every <t> element found in "word/document.xml" and in the
+// entries whose name starts with "ppt/slides/slide" is appended to the
+// result, each followed by a single space. ErrNotFound is returned when the
+// archive contains none of those entries. r is not closed by this function.
+func OfficeFormater(r io.ReadCloser) (io.ReadCloser, error) {
+	// ioutil.TempFile picks a unique name and creates the file with O_EXCL and
+	// mode 0600, unlike a guessable /tmp/docx_<rand>.docx opened without O_EXCL.
+	tmp, err := ioutil.TempFile("", "docx_")
+	if err != nil {
+		return nil, err
+	}
+	tmpName := tmp.Name()
+	defer os.Remove(tmpName)
+	_, err = io.Copy(tmp, r)
+	// Always release the descriptor; a failed close counts as a failed write.
+	if cerr := tmp.Close(); err == nil {
+		err = cerr
+	}
+	if err != nil {
+		return nil, err
+	}
+	z, err := zip.OpenReader(tmpName)
+	if err != nil {
+		return nil, err
+	}
+	defer z.Close()
+
+	hasData := false
+	content := bytes.NewBuffer([]byte{})
+	for _, entry := range z.File {
+		if entry.Name != "word/document.xml" && !strings.HasPrefix(entry.Name, "ppt/slides/slide") {
+			continue
+		}
+		hasData = true
+		if err := appendOfficeText(content, entry); err != nil {
+			return nil, err
+		}
+	}
+	if !hasData {
+		return nil, ErrNotFound
+	}
+	return NewReadCloserFromReader(content), nil
+}
+
+// appendOfficeText writes the normalised content of every <t> element of the
+// zip entry to content. It only fails when the entry cannot be opened: a
+// decoding error ends the entry and keeps the text gathered so far.
+func appendOfficeText(content *bytes.Buffer, entry *zip.File) error {
+	o, err := entry.Open()
+	if err != nil {
+		return err
+	}
+	// Closed once per entry; a defer in the caller's loop would only run at return.
+	defer o.Close()
+
+	dec := xml.NewDecoder(o)
+	for {
+		t, err := dec.Token()
+		if err != nil || t == nil {
+			return nil
+		}
+		el, ok := t.(xml.StartElement)
+		if !ok || el.Name.Local != "t" {
+			continue
+		}
+		w := WordDoc{}
+		if err := dec.DecodeElement(&w, &el); err != nil {
+			return nil
+		}
+		if len(w.Text) == 0 {
+			continue
+		}
+		w.Text = officeLoneDot.ReplaceAll(w.Text, []byte(". "))
+		w.Text = officeSpaces.ReplaceAll(w.Text, []byte(" "))
+		content.Write(w.Text)
+		content.WriteByte(' ')
+	}
+}
+
+// WordDoc receives the raw inner XML of a decoded <t> element.
+type WordDoc struct {
+	Text []byte `xml:",innerxml"`
+}
diff --git a/server/model/formater/pdf.go b/server/model/formater/pdf.go
new file mode 100644
index 00000000..38c78054
--- /dev/null
+++ b/server/model/formater/pdf.go
@@ -0,0 +1,45 @@
+package formater
+
+import (
+	"bytes"
+	. "github.com/mickael-kerjean/filestash/server/common"
+	"io"
+	"io/ioutil"
+	"os"
+	"os/exec"
+)
+
+// PdfFormater extracts the text of a PDF document read from r.
+//
+// The stream is spooled to a temporary file which is handed to the external
+// "pdftotext" command; whatever the command prints on its standard output is
+// returned. An error is returned when the file cannot be written or when the
+// command cannot be run or exits unsuccessfully. r is not closed by this
+// function.
+func PdfFormater(r io.ReadCloser) (io.ReadCloser, error) {
+	// ioutil.TempFile picks a unique name and creates the file with O_EXCL and
+	// mode 0600, unlike a guessable /tmp/pdf_<rand>.docx opened without O_EXCL.
+	tmp, err := ioutil.TempFile("", "pdf_")
+	if err != nil {
+		return nil, err
+	}
+	tmpName := tmp.Name()
+	defer os.Remove(tmpName)
+	_, err = io.Copy(tmp, r)
+	// The file is closed before pdftotext reads it; a failed close counts as
+	// a failed write.
+	if cerr := tmp.Close(); err == nil {
+		err = cerr
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	cmd := exec.Command("pdftotext", tmpName, "-")
+	out := bytes.NewBuffer([]byte{})
+	cmd.Stdout = out
+	if err = cmd.Run(); err != nil {
+		return nil, err
+	}
+	return NewReadCloserFromReader(out), nil
+}
diff --git a/server/model/formater/txt.go b/server/model/formater/txt.go
new file mode 100644
index 00000000..b2312843
--- /dev/null
+++ b/server/model/formater/txt.go
@@ -0,0 +1,10 @@
+package formater
+
+import (
+	"io"
+)
+
+// TxtFormater returns rc as is: plain text is not converted.
+func TxtFormater(rc io.ReadCloser) (io.ReadCloser, error) {
+	return rc, nil
+}
diff --git a/server/model/search.go b/server/model/search.go
index 5b95e0d3..604d133a 100644
--- a/server/model/search.go
+++ b/server/model/search.go
@@ -6,7 +6,9 @@ import (
 	"encoding/base64"
 	"github.com/mattn/go-sqlite3"
 	. "github.com/mickael-kerjean/filestash/server/common"
+	"github.com/mickael-kerjean/filestash/server/model/formater"
 	"hash/fnv"
+	"io"
 	"math/rand"
 	"path/filepath"
 	"regexp"
@@ -485,7 +487,57 @@ func(this *SearchIndexer) Discover() bool {
 }
 
+// Indexing processes at most one pending file per call: a row of type 'file',
+// smaller than 512000 bytes, with filetype 'txt' and no indexTime. It returns
+// false when no such row can be read or when the backend cannot open the file,
+// true otherwise. The row gets its indexTime set on every path past the lookup.
 func(this *SearchIndexer) Indexing() bool {
-	return false
+	var path string
+	err := this.db.QueryRow(
+		"SELECT path FROM file WHERE (" +
+		" type = 'file' AND size < 512000 AND filetype = 'txt' AND indexTime IS NULL" +
+		") LIMIT 1;",
+	).Scan(&path)
+	if err != nil {
+		return false
+	}
+	// The timestamp is taken now but written on exit, failures included, so a
+	// file that cannot be read or converted no longer matches the query above.
+	indexTime := time.Now()
+	defer func() {
+		_, uerr := this.db.Exec(
+			"UPDATE file SET indexTime = ? WHERE path = ?",
+			indexTime, path,
+		)
+		if uerr != nil {
+			Log.Warning("search::indexing update_error (%v)", uerr)
+		}
+	}()
+
+	mime := GetMimeType(path)
+	var reader io.ReadCloser
+	reader, err = this.Backend.Cat(path)
+	if err != nil {
+		return false
+	}
+	defer reader.Close()
+	// NOTE(review): the converted reader is not read after this switch, so no
+	// extracted text is stored anywhere yet.
+	switch mime {
+	case "text/plain", "text/org", "text/markdown":
+		reader, err = formater.TxtFormater(reader)
+	case "application/pdf":
+		reader, err = formater.PdfFormater(reader)
+	case "application/powerpoint", "application/vnd.ms-powerpoint",
+		"application/word", "application/msword":
+		reader, err = formater.OfficeFormater(reader)
+	default:
+		return true
+	}
+	if err != nil {
+		Log.Warning("search::indexing formater_error (%v)", err)
+		return true
+	}
+	return true
 }
 
 func(this SearchIndexer) Bookkeeping() bool {