feature (search): content indexing: pdf, docx, txt, pptx, org

This commit is contained in:
= 2019-04-04 19:33:11 +11:00
parent e88758567d
commit 4ba5a7c346
5 changed files with 166 additions and 1 deletions

View file

@ -67,6 +67,7 @@
"m3u8": "application/vnd.apple.mpegurl",
"m4a": "audio/x-m4a",
"m4v": "video/x-m4v",
"md": "text/markdown",
"mdc": "image/x-minolta-mdc",
"mef": "image/x-mamiya-mef",
"mid": "audio/midi",

View file

@ -0,0 +1,82 @@
package formater
import (
"archive/zip"
"bytes"
"encoding/xml"
"fmt"
. "github.com/mickael-kerjean/filestash/server/common"
"io"
"math/rand"
"os"
"regexp"
"strings"
)
func OfficeFormater(r io.ReadCloser) (io.ReadCloser, error) {
tmpName := fmt.Sprintf("/tmp/docx_%d.docx", rand.Intn(1000000))
defer os.Remove(tmpName)
f, err := os.OpenFile(tmpName, os.O_CREATE | os.O_WRONLY, os.ModePerm)
if err != nil {
return nil, err
}
_, err = io.Copy(f, r)
if err != nil {
return nil, err
}
z, err := zip.OpenReader(tmpName)
if err != nil {
return nil, err
}
defer z.Close()
hasData := false
content := bytes.NewBuffer([]byte{})
for _, f := range z.File {
shouldExtract := false
if f.Name == "word/document.xml" { shouldExtract = true }
if strings.HasPrefix(f.Name, "ppt/slides/slide") { shouldExtract = true }
if shouldExtract == false {
continue
}
hasData = true
o, err := f.Open()
if err != nil {
return nil, err
}
dec := xml.NewDecoder(o)
for {
t, err := dec.Token()
if err != nil {
break
}
if t == nil {
break
}
switch el := t.(type) {
case xml.StartElement:
if el.Name.Local == "t" {
w := WordDoc{}
dec.DecodeElement(&w, &el)
if len(w.Text) > 0 {
w.Text = regexp.MustCompile("\\s+\\.\\s+").ReplaceAll(w.Text, []byte(". "))
w.Text = regexp.MustCompile("\\s{2,}").ReplaceAll(w.Text, []byte(" "))
content.Write(w.Text)
content.Write([]byte(" "))
}
}
}
}
}
if hasData == false {
return nil, ErrNotFound
}
return NewReadCloserFromReader(content), nil
}
// WordDoc is the decoding target for a single XML text element: Text receives
// the element's raw inner XML (the ",innerxml" tag), not unescaped character
// data.
type WordDoc struct {
Text []byte `xml:",innerxml"`
}

View file

@ -0,0 +1,35 @@
package formater
import (
"bytes"
"fmt"
. "github.com/mickael-kerjean/filestash/server/common"
"io"
"math/rand"
"os"
"os/exec"
)
// PdfFormater converts the PDF read from r to plain text by spooling it to a
// temporary file and running the external `pdftotext` program on it, with the
// text captured from the program's standard output.
//
// It returns an error when the stream cannot be spooled to disk or when
// `pdftotext` cannot be started or exits unsuccessfully. r is read but not
// closed.
func PdfFormater(r io.ReadCloser) (io.ReadCloser, error) {
	// O_EXCL guarantees a fresh file owned by this call: an existing file is
	// never partially overwritten, and the name is re-drawn on collision.
	var (
		tmpName string
		f       *os.File
		err     error
	)
	for attempt := 0; attempt < 10; attempt++ {
		tmpName = fmt.Sprintf("/tmp/pdf_%d.docx", rand.Intn(1000000))
		f, err = os.OpenFile(tmpName, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0600)
		if err == nil || !os.IsExist(err) {
			break
		}
	}
	if err != nil {
		return nil, err
	}
	// Registered only after creation succeeded, so a file belonging to
	// someone else is never removed.
	defer os.Remove(tmpName)

	_, err = io.Copy(f, r)
	// The file must be closed before pdftotext reads it; a failed close can
	// mean the data never reached disk, so it is reported too.
	if cerr := f.Close(); err == nil {
		err = cerr
	}
	if err != nil {
		return nil, err
	}

	out := bytes.NewBuffer([]byte{})
	cmd := exec.Command("pdftotext", tmpName, "-")
	cmd.Stdout = out
	if err = cmd.Run(); err != nil {
		return nil, err
	}
	return NewReadCloserFromReader(out), nil
}

View file

@ -0,0 +1,9 @@
package formater
import (
"io"
)
// TxtFormater is the formater for content that is already plain text: it
// hands rc back untouched and never fails.
func TxtFormater(rc io.ReadCloser) (io.ReadCloser, error) {
	// Nothing to convert; the caller keeps ownership of the same reader.
	return rc, nil
}

View file

@ -6,7 +6,9 @@ import (
"encoding/base64"
"github.com/mattn/go-sqlite3"
. "github.com/mickael-kerjean/filestash/server/common"
"github.com/mickael-kerjean/filestash/server/model/formater"
"hash/fnv"
"io"
"math/rand"
"path/filepath"
"regexp"
@ -485,7 +487,43 @@ func(this *SearchIndexer) Discover() bool {
}
// Indexing processes at most one file that has not been indexed yet.
//
// It selects a single row from the `file` table (currently restricted by the
// query to type 'file', size below 512000 and filetype 'txt' with a NULL
// indexTime), opens the file through the backend and runs the formater that
// matches its MIME type. Once a path has been selected, its indexTime is
// stamped when the function returns, whatever the outcome.
//
// It returns false when no candidate row could be read or the backend could
// not open the file, and true otherwise (including unsupported MIME types and
// formater failures, which are only logged).
func (this *SearchIndexer) Indexing() bool {
	var path string
	err := this.db.QueryRow(
		"SELECT path FROM file WHERE (" +
			"  type = 'file' AND size < 512000 AND filetype = 'txt' AND indexTime IS NULL" +
			") LIMIT 1;",
	).Scan(&path)
	if err != nil {
		return false
	}
	// Arguments are evaluated here, the statement itself runs on return.
	defer this.db.Exec(
		"UPDATE file SET indexTime = ? WHERE path = ?",
		time.Now(), path,
	)

	var reader io.ReadCloser
	reader, err = this.Backend.Cat(path)
	if err != nil {
		return false
	}
	// Bound to the backend reader at this point; later reassignments of
	// `reader` do not change what gets closed.
	defer reader.Close()

	switch GetMimeType(path) {
	case "text/plain", "text/org", "text/markdown":
		reader, err = formater.TxtFormater(reader)
	case "application/pdf":
		reader, err = formater.PdfFormater(reader)
	case "application/powerpoint", "application/vnd.ms-powerpoint",
		"application/word", "application/msword":
		reader, err = formater.OfficeFormater(reader)
	default:
		return true
	}
	if err != nil {
		Log.Warning("search::indexing formater_error (%v)", err)
		return true
	}
	// NOTE(review): the formatted reader is not consumed in the code visible
	// here — presumably storing the extracted text is still to come; confirm.
	return true
}
func(this SearchIndexer) Bookkeeping() bool {