Diffstat (limited to 'parser/pageparser/pageparser.go')
-rw-r--r-- | parser/pageparser/pageparser.go | 195 |
1 file changed, 195 insertions, 0 deletions
diff --git a/parser/pageparser/pageparser.go b/parser/pageparser/pageparser.go
new file mode 100644
index 000000000..f73eee706
--- /dev/null
+++ b/parser/pageparser/pageparser.go
@@ -0,0 +1,195 @@
+// Copyright 2019 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pageparser provides a parser for Hugo content files (Markdown, HTML etc.).
+// This implementation is highly inspired by the great talk given by Rob Pike called "Lexical Scanning in Go".
+// It's on YouTube; Google it!
+// See slides here: http://cuddle.googlecode.com/hg/talk/lex.html
+package pageparser
+
+import (
+	"bytes"
+	"io"
+	"io/ioutil"
+
+	"github.com/gohugoio/hugo/parser/metadecoders"
+	"github.com/pkg/errors"
+)
+
+// Result holds the parse result.
+type Result interface {
+	// Iterator returns a new Iterator positioned at the beginning of the parse tree.
+	Iterator() *Iterator
+	// Input returns the input to Parse.
+	Input() []byte
+}
+
+var _ Result = (*pageLexer)(nil)
+
+// Parse parses the page in the given reader according to the given Config.
+// TODO(bep) now that we have improved the "lazy order" init, there *may* be
+// some potential saving in a buffered approach where the first pass does
+// the front matter only.
+func Parse(r io.Reader, cfg Config) (Result, error) {
+	return parseSection(r, cfg, lexIntroSection)
+}
+
+type ContentFrontMatter struct {
+	Content           []byte
+	FrontMatter       map[string]interface{}
+	FrontMatterFormat metadecoders.Format
+}
+
+// ParseFrontMatterAndContent is a convenience method to extract front matter
+// and content from a content page.
+func ParseFrontMatterAndContent(r io.Reader) (ContentFrontMatter, error) {
+	var cf ContentFrontMatter
+
+	psr, err := Parse(r, Config{})
+	if err != nil {
+		return cf, err
+	}
+
+	var frontMatterSource []byte
+
+	iter := psr.Iterator()
+
+	walkFn := func(item Item) bool {
+		if frontMatterSource != nil {
+			// The rest is content.
+			cf.Content = psr.Input()[item.Pos:]
+			// Done.
+			return false
+		} else if item.IsFrontMatter() {
+			cf.FrontMatterFormat = FormatFromFrontMatterType(item.Type)
+			frontMatterSource = item.Val
+		}
+		return true
+
+	}
+
+	iter.PeekWalk(walkFn)
+
+	cf.FrontMatter, err = metadecoders.Default.UnmarshalToMap(frontMatterSource, cf.FrontMatterFormat)
+	return cf, err
+}
+
+func FormatFromFrontMatterType(typ ItemType) metadecoders.Format {
+	switch typ {
+	case TypeFrontMatterJSON:
+		return metadecoders.JSON
+	case TypeFrontMatterORG:
+		return metadecoders.ORG
+	case TypeFrontMatterTOML:
+		return metadecoders.TOML
+	case TypeFrontMatterYAML:
+		return metadecoders.YAML
+	default:
+		return ""
+	}
+}
+
+// ParseMain parses starting with the main section. Used in tests.
+func ParseMain(r io.Reader, cfg Config) (Result, error) {
+	return parseSection(r, cfg, lexMainSection)
+}
+
+func parseSection(r io.Reader, cfg Config, start stateFunc) (Result, error) {
+	b, err := ioutil.ReadAll(r)
+	if err != nil {
+		return nil, errors.Wrap(err, "failed to read page content")
+	}
+	return parseBytes(b, cfg, start)
+}
+
+func parseBytes(b []byte, cfg Config, start stateFunc) (Result, error) {
+	lexer := newPageLexer(b, start, cfg)
+	lexer.run()
+	return lexer, nil
+}
+
+// An Iterator has methods to iterate a parsed page with support for going back
+// if needed.
+type Iterator struct {
+	l       *pageLexer
+	lastPos int // position of the last item returned by nextItem
+}
+
+// Next consumes and returns the next item.
+func (t *Iterator) Next() Item {
+	t.lastPos++
+	return t.Current()
+}
+
+// Input returns the input source.
+func (t *Iterator) Input() []byte {
+	return t.l.Input()
+}
+
+var errIndexOutOfBounds = Item{tError, 0, []byte("no more tokens"), true}
+
+// Current will repeatedly return the current item.
+func (t *Iterator) Current() Item {
+	if t.lastPos >= len(t.l.items) {
+		return errIndexOutOfBounds
+	}
+	return t.l.items[t.lastPos]
+}
+
+// Backup backs up one token.
+func (t *Iterator) Backup() {
+	if t.lastPos < 0 {
+		panic("need to go forward before going back")
+	}
+	t.lastPos--
+}
+
+// IsValueNext checks for non-error and non-EOF types coming next.
+func (t *Iterator) IsValueNext() bool {
+	i := t.Peek()
+	return i.Type != tError && i.Type != tEOF
+}
+
+// Peek looks at, but does not consume, the next item.
+// Repeated, sequential calls will return the same item.
+func (t *Iterator) Peek() Item {
+	return t.l.items[t.lastPos+1]
+}
+
+// PeekWalk will feed the next items in the iterator to walkFn
+// until it returns false.
+func (t *Iterator) PeekWalk(walkFn func(item Item) bool) {
+	for i := t.lastPos + 1; i < len(t.l.items); i++ {
+		item := t.l.items[i]
+		if !walkFn(item) {
+			break
+		}
+	}
+}
+
+// Consume is a convenience method to consume the next n tokens,
+// but backs off on Errors and EOF.
+func (t *Iterator) Consume(cnt int) {
+	for i := 0; i < cnt; i++ {
+		token := t.Next()
+		if token.Type == tError || token.Type == tEOF {
+			t.Backup()
+			break
+		}
+	}
+}
+
+// LineNumber returns the current line number. Used for logging.
+func (t *Iterator) LineNumber() int {
+	return bytes.Count(t.l.input[:t.Current().Pos], lf) + 1
+}
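For context, a minimal sketch of how ParseFrontMatterAndContent added in this commit might be called from outside the package. The ParseFrontMatterAndContent signature and the ContentFrontMatter fields are taken from the diff above; the main scaffolding and the page literal are illustrative assumptions:

	package main

	import (
		"fmt"
		"log"
		"strings"

		"github.com/gohugoio/hugo/parser/pageparser"
	)

	func main() {
		// A hypothetical content page with TOML front matter (+++ delimiters).
		page := `+++
	title = "My Post"
	draft = true
	+++

	Some **Markdown** content.
	`

		cf, err := pageparser.ParseFrontMatterAndContent(strings.NewReader(page))
		if err != nil {
			log.Fatal(err)
		}

		// FrontMatterFormat is a metadecoders.Format ("toml" here);
		// FrontMatter is the decoded map, Content the remaining bytes.
		fmt.Println("format:", cf.FrontMatterFormat)
		fmt.Println("title:", cf.FrontMatter["title"])
		fmt.Printf("content: %s", cf.Content)
	}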
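And a sketch of driving the token stream directly through Parse, Iterator and PeekWalk. This reuses the page variable from the previous example; Item's exported Pos and Val fields and the zero-value Config are used exactly as in the file above. Note that PeekWalk only peeks, so the iterator's position is unchanged afterwards:

	res, err := pageparser.Parse(strings.NewReader(page), pageparser.Config{})
	if err != nil {
		log.Fatal(err)
	}

	iter := res.Iterator()
	iter.PeekWalk(func(item pageparser.Item) bool {
		// Print each token's byte offset and raw value; returning
		// false here would stop the walk early.
		fmt.Printf("%d: %q\n", item.Pos, item.Val)
		return true
	})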