mirror of
https://github.com/mickael-kerjean/filestash
synced 2025-12-06 08:22:24 +01:00
feature (search): content indexing: pdf, docx, txt, pptx, org
This commit is contained in:
parent
e88758567d
commit
4ba5a7c346
5 changed files with 166 additions and 1 deletions
|
|
@ -67,6 +67,7 @@
|
|||
"m3u8": "application/vnd.apple.mpegurl",
|
||||
"m4a": "audio/x-m4a",
|
||||
"m4v": "video/x-m4v",
|
||||
"md": "text/markdown",
|
||||
"mdc": "image/x-minolta-mdc",
|
||||
"mef": "image/x-mamiya-mef",
|
||||
"mid": "audio/midi",
|
||||
|
|
|
|||
82
server/model/formater/office.go
Normal file
82
server/model/formater/office.go
Normal file
|
|
@ -0,0 +1,82 @@
|
|||
package formater
|
||||
|
||||
import (
|
||||
"archive/zip"
|
||||
"bytes"
|
||||
"encoding/xml"
|
||||
"fmt"
|
||||
. "github.com/mickael-kerjean/filestash/server/common"
|
||||
"io"
|
||||
"math/rand"
|
||||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func OfficeFormater(r io.ReadCloser) (io.ReadCloser, error) {
|
||||
tmpName := fmt.Sprintf("/tmp/docx_%d.docx", rand.Intn(1000000))
|
||||
defer os.Remove(tmpName)
|
||||
f, err := os.OpenFile(tmpName, os.O_CREATE | os.O_WRONLY, os.ModePerm)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
_, err = io.Copy(f, r)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
z, err := zip.OpenReader(tmpName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer z.Close()
|
||||
|
||||
hasData := false
|
||||
content := bytes.NewBuffer([]byte{})
|
||||
for _, f := range z.File {
|
||||
shouldExtract := false
|
||||
if f.Name == "word/document.xml" { shouldExtract = true }
|
||||
if strings.HasPrefix(f.Name, "ppt/slides/slide") { shouldExtract = true }
|
||||
|
||||
if shouldExtract == false {
|
||||
continue
|
||||
}
|
||||
hasData = true
|
||||
o, err := f.Open()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
dec := xml.NewDecoder(o)
|
||||
for {
|
||||
t, err := dec.Token()
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
if t == nil {
|
||||
break
|
||||
}
|
||||
switch el := t.(type) {
|
||||
case xml.StartElement:
|
||||
if el.Name.Local == "t" {
|
||||
w := WordDoc{}
|
||||
dec.DecodeElement(&w, &el)
|
||||
if len(w.Text) > 0 {
|
||||
w.Text = regexp.MustCompile("\\s+\\.\\s+").ReplaceAll(w.Text, []byte(". "))
|
||||
w.Text = regexp.MustCompile("\\s{2,}").ReplaceAll(w.Text, []byte(" "))
|
||||
content.Write(w.Text)
|
||||
content.Write([]byte(" "))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if hasData == false {
|
||||
return nil, ErrNotFound
|
||||
}
|
||||
return NewReadCloserFromReader(content), nil
|
||||
}
|
||||
|
||||
|
||||
// WordDoc is the decoding target used for a single XML element: the element's
// body is captured verbatim rather than being parsed further.
type WordDoc struct {
	// Text holds the raw inner XML of the decoded element.
	Text []byte `xml:",innerxml"`
}
|
||||
35
server/model/formater/pdf.go
Normal file
35
server/model/formater/pdf.go
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
package formater
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
. "github.com/mickael-kerjean/filestash/server/common"
|
||||
"io"
|
||||
"math/rand"
|
||||
"os"
|
||||
"os/exec"
|
||||
)
|
||||
|
||||
func PdfFormater(r io.ReadCloser) (io.ReadCloser, error) {
|
||||
tmpName := fmt.Sprintf("/tmp/pdf_%d.docx", rand.Intn(1000000))
|
||||
defer os.Remove(tmpName)
|
||||
f, err := os.OpenFile(tmpName, os.O_CREATE | os.O_WRONLY, os.ModePerm)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
_, err = io.Copy(f, r)
|
||||
if err != nil {
|
||||
f.Close()
|
||||
return nil, err
|
||||
}
|
||||
f.Close()
|
||||
|
||||
cmd := exec.Command("pdftotext", tmpName, "-")
|
||||
out := bytes.NewBuffer([]byte{})
|
||||
cmd.Stdout = out
|
||||
err = cmd.Run()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return NewReadCloserFromReader(out), nil
|
||||
}
|
||||
9
server/model/formater/txt.go
Normal file
9
server/model/formater/txt.go
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
package formater
|
||||
|
||||
import (
|
||||
"io"
|
||||
)
|
||||
|
||||
// TxtFormater is the formatter for content that is already plain text: the
// stream is handed back untouched and the error is always nil.
func TxtFormater(rc io.ReadCloser) (io.ReadCloser, error) {
	// Nothing to convert, the caller keeps ownership of the very same reader.
	return rc, nil
}
|
||||
|
|
@ -6,7 +6,9 @@ import (
|
|||
"encoding/base64"
|
||||
"github.com/mattn/go-sqlite3"
|
||||
. "github.com/mickael-kerjean/filestash/server/common"
|
||||
"github.com/mickael-kerjean/filestash/server/model/formater"
|
||||
"hash/fnv"
|
||||
"io"
|
||||
"math/rand"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
|
|
@ -485,7 +487,43 @@ func(this *SearchIndexer) Discover() bool {
|
|||
}
|
||||
|
||||
func(this *SearchIndexer) Indexing() bool {
|
||||
var path string
|
||||
err := this.db.QueryRow(
|
||||
"SELECT path FROM file WHERE (" +
|
||||
" type = 'file' AND size < 512000 AND filetype = 'txt' AND indexTime IS NULL" +
|
||||
") LIMIT 1;",
|
||||
).Scan(&path)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
defer this.db.Exec(
|
||||
"UPDATE file SET indexTime = ? WHERE path = ?",
|
||||
time.Now(), path,
|
||||
)
|
||||
|
||||
mime := GetMimeType(path)
|
||||
var reader io.ReadCloser
|
||||
reader, err = this.Backend.Cat(path)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
defer reader.Close()
|
||||
switch mime {
|
||||
case "text/plain": reader, err = formater.TxtFormater(reader)
|
||||
case "text/org": reader, err = formater.TxtFormater(reader)
|
||||
case "text/markdown": reader, err = formater.TxtFormater(reader)
|
||||
case "application/pdf": reader, err = formater.PdfFormater(reader)
|
||||
case "application/powerpoint": reader, err = formater.OfficeFormater(reader)
|
||||
case "application/vnd.ms-powerpoint": reader, err = formater.OfficeFormater(reader)
|
||||
case "application/word": reader, err = formater.OfficeFormater(reader)
|
||||
case "application/msword": reader, err = formater.OfficeFormater(reader)
|
||||
default: return true
|
||||
}
|
||||
if err != nil {
|
||||
Log.Warning("search::indexing formater_error (%v)", err)
|
||||
return true
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func(this SearchIndexer) Bookkeeping() bool {
|
||||
|
|
|
|||
Loading…
Reference in a new issue