suffixarray.go

     1  // Copyright 2010 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package suffixarray implements substring search in logarithmic time using
     6  // an in-memory suffix array.
     7  //
     8  // Example use:
     9  //
    10  //	// create index for some data
    11  //	index := suffixarray.New(data)
    12  //
    13  //	// lookup byte slice s
    14  //	offsets1 := index.Lookup(s, -1) // the list of all indices where s occurs in data
    15  //	offsets2 := index.Lookup(s, 3)  // the list of at most 3 indices where s occurs in data
    16  package suffixarray
    17  
    18  import (
    19  	"bytes"
    20  	"encoding/binary"
    21  	"errors"
    22  	"io"
    23  	"math"
    24  	"regexp"
    25  	"sort"
    26  )
    27  
    28  // Can change for testing
    29  var maxData32 int = realMaxData32
    30  
    31  const realMaxData32 = math.MaxInt32
    32  
    33  // Index implements a suffix array for fast substring search.
    34  type Index struct {
    35  	data []byte
    36  	sa   ints // suffix array for data; sa.len() == len(data)
    37  }
    38  
    39  // An ints is either an []int32 or an []int64.
    40  // That is, one of them is empty, and one is the real data.
    41  // The int64 form is used when len(data) > maxData32
    42  type ints struct {
    43  	int32 []int32
    44  	int64 []int64
    45  }
    46  
    47  func (a *ints) len() int {
    48  	return len(a.int32) + len(a.int64)
    49  }
    50  
    51  func (a *ints) get(i int) int64 {
    52  	if a.int32 != nil {
    53  		return int64(a.int32[i])
    54  	}
    55  	return a.int64[i]
    56  }
    57  
    58  func (a *ints) set(i int, v int64) {
    59  	if a.int32 != nil {
    60  		a.int32[i] = int32(v)
    61  	} else {
    62  		a.int64[i] = v
    63  	}
    64  }
    65  
    66  func (a *ints) slice(i, j int) ints {
    67  	if a.int32 != nil {
    68  		return ints{a.int32[i:j], nil}
    69  	}
    70  	return ints{nil, a.int64[i:j]}
    71  }
    72  
    73  // New creates a new [Index] for data.
    74  // [Index] creation time is O(N) for N = len(data).
    75  func New(data []byte) *Index {
    76  	ix := &Index{data: data}
    77  	if len(data) <= maxData32 {
    78  		ix.sa.int32 = make([]int32, len(data))
    79  		text_32(data, ix.sa.int32)
    80  	} else {
    81  		ix.sa.int64 = make([]int64, len(data))
    82  		text_64(data, ix.sa.int64)
    83  	}
    84  	return ix
    85  }
    86  
    87  // writeInt writes an int x to w using buf to buffer the write.
    88  func writeInt(w io.Writer, buf []byte, x int) error {
    89  	binary.PutVarint(buf, int64(x))
    90  	_, err := w.Write(buf[0:binary.MaxVarintLen64])
    91  	return err
    92  }
    93  
    94  // readInt reads an int x from r using buf to buffer the read and returns x.
    95  func readInt(r io.Reader, buf []byte) (int64, error) {
    96  	_, err := io.ReadFull(r, buf[0:binary.MaxVarintLen64]) // ok to continue with error
    97  	x, _ := binary.Varint(buf)
    98  	return x, err
    99  }
   100  
   101  // writeSlice writes data[:n] to w and returns n.
   102  // It uses buf to buffer the write.
   103  func writeSlice(w io.Writer, buf []byte, data ints) (n int, err error) {
   104  	// encode as many elements as fit into buf
   105  	p := binary.MaxVarintLen64
   106  	m := data.len()
   107  	for ; n < m && p+binary.MaxVarintLen64 <= len(buf); n++ {
   108  		p += binary.PutUvarint(buf[p:], uint64(data.get(n)))
   109  	}
   110  
   111  	// update buffer size
   112  	binary.PutVarint(buf, int64(p))
   113  
   114  	// write buffer
   115  	_, err = w.Write(buf[0:p])
   116  	return
   117  }
   118  
   119  var errTooBig = errors.New("suffixarray: data too large")
   120  
   121  // readSlice reads data[:n] from r and returns n.
   122  // It uses buf to buffer the read.
   123  func readSlice(r io.Reader, buf []byte, data ints) (n int, err error) {
   124  	// read buffer size
   125  	var size64 int64
   126  	size64, err = readInt(r, buf)
   127  	if err != nil {
   128  		return
   129  	}
   130  	if int64(int(size64)) != size64 || int(size64) < 0 {
   131  		// We never write chunks this big anyway.
   132  		return 0, errTooBig
   133  	}
   134  	size := int(size64)
   135  
   136  	// read buffer w/o the size
   137  	if _, err = io.ReadFull(r, buf[binary.MaxVarintLen64:size]); err != nil {
   138  		return
   139  	}
   140  
   141  	// decode as many elements as present in buf
   142  	for p := binary.MaxVarintLen64; p < size; n++ {
   143  		x, w := binary.Uvarint(buf[p:])
   144  		data.set(n, int64(x))
   145  		p += w
   146  	}
   147  
   148  	return
   149  }
   150  
   151  const bufSize = 16 << 10 // reasonable for BenchmarkSaveRestore
   152  
   153  // Read reads the index from r into x; x must not be nil.
   154  func (x *Index) Read(r io.Reader) error {
   155  	// buffer for all reads
   156  	buf := make([]byte, bufSize)
   157  
   158  	// read length
   159  	n64, err := readInt(r, buf)
   160  	if err != nil {
   161  		return err
   162  	}
   163  	if int64(int(n64)) != n64 || int(n64) < 0 {
   164  		return errTooBig
   165  	}
   166  	n := int(n64)
   167  
   168  	// allocate space
   169  	if 2*n < cap(x.data) || cap(x.data) < n || x.sa.int32 != nil && n > maxData32 || x.sa.int64 != nil && n <= maxData32 {
   170  		// new data is significantly smaller or larger than
   171  		// existing buffers - allocate new ones
   172  		x.data = make([]byte, n)
   173  		x.sa.int32 = nil
   174  		x.sa.int64 = nil
   175  		if n <= maxData32 {
   176  			x.sa.int32 = make([]int32, n)
   177  		} else {
   178  			x.sa.int64 = make([]int64, n)
   179  		}
   180  	} else {
   181  		// re-use existing buffers
   182  		x.data = x.data[0:n]
   183  		x.sa = x.sa.slice(0, n)
   184  	}
   185  
   186  	// read data
   187  	if _, err := io.ReadFull(r, x.data); err != nil {
   188  		return err
   189  	}
   190  
   191  	// read index
   192  	sa := x.sa
   193  	for sa.len() > 0 {
   194  		n, err := readSlice(r, buf, sa)
   195  		if err != nil {
   196  			return err
   197  		}
   198  		sa = sa.slice(n, sa.len())
   199  	}
   200  	return nil
   201  }
   202  
   203  // Write writes the index x to w.
   204  func (x *Index) Write(w io.Writer) error {
   205  	// buffer for all writes
   206  	buf := make([]byte, bufSize)
   207  
   208  	// write length
   209  	if err := writeInt(w, buf, len(x.data)); err != nil {
   210  		return err
   211  	}
   212  
   213  	// write data
   214  	if _, err := w.Write(x.data); err != nil {
   215  		return err
   216  	}
   217  
   218  	// write index
   219  	sa := x.sa
   220  	for sa.len() > 0 {
   221  		n, err := writeSlice(w, buf, sa)
   222  		if err != nil {
   223  			return err
   224  		}
   225  		sa = sa.slice(n, sa.len())
   226  	}
   227  	return nil
   228  }
   229  
   230  // Bytes returns the data over which the index was created.
   231  // It must not be modified.
   232  func (x *Index) Bytes() []byte {
   233  	return x.data
   234  }
   235  
   236  func (x *Index) at(i int) []byte {
   237  	return x.data[x.sa.get(i):]
   238  }
   239  
   240  // lookupAll returns a slice into the matching region of the index.
   241  // The runtime is O(log(N)*len(s)).
   242  func (x *Index) lookupAll(s []byte) ints {
   243  	// find matching suffix index range [i:j]
   244  	// find the first index where s would be the prefix
   245  	i := sort.Search(x.sa.len(), func(i int) bool { return bytes.Compare(x.at(i), s) >= 0 })
   246  	// starting at i, find the first index at which s is not a prefix
   247  	j := i + sort.Search(x.sa.len()-i, func(j int) bool { return !bytes.HasPrefix(x.at(j+i), s) })
   248  	return x.sa.slice(i, j)
   249  }
   250  
   251  // Lookup returns an unsorted list of at most n indices where the byte string s
   252  // occurs in the indexed data. If n < 0, all occurrences are returned.
   253  // The result is nil if s is empty, s is not found, or n == 0.
   254  // Lookup time is O(log(N)*len(s) + len(result)) where N is the
   255  // size of the indexed data.
   256  func (x *Index) Lookup(s []byte, n int) (result []int) {
   257  	if len(s) > 0 && n != 0 {
   258  		matches := x.lookupAll(s)
   259  		count := matches.len()
   260  		if n < 0 || count < n {
   261  			n = count
   262  		}
   263  		// 0 <= n <= count
   264  		if n > 0 {
   265  			result = make([]int, n)
   266  			if matches.int32 != nil {
   267  				for i := range result {
   268  					result[i] = int(matches.int32[i])
   269  				}
   270  			} else {
   271  				for i := range result {
   272  					result[i] = int(matches.int64[i])
   273  				}
   274  			}
   275  		}
   276  	}
   277  	return
   278  }
   279  
   280  // FindAllIndex returns a sorted list of non-overlapping matches of the
   281  // regular expression r, where a match is a pair of indices specifying
   282  // the matched slice of x.Bytes(). If n < 0, all matches are returned
   283  // in successive order. Otherwise, at most n matches are returned and
   284  // they may not be successive. The result is nil if there are no matches,
   285  // or if n == 0.
   286  func (x *Index) FindAllIndex(r *regexp.Regexp, n int) (result [][]int) {
   287  	// a non-empty literal prefix is used to determine possible
   288  	// match start indices with Lookup
   289  	prefix, complete := r.LiteralPrefix()
   290  	lit := []byte(prefix)
   291  
   292  	// worst-case scenario: no literal prefix
   293  	if prefix == "" {
   294  		return r.FindAllIndex(x.data, n)
   295  	}
   296  
   297  	// if regexp is a literal just use Lookup and convert its
   298  	// result into match pairs
   299  	if complete {
   300  		// Lookup returns indices that may belong to overlapping matches.
   301  		// After eliminating them, we may end up with fewer than n matches.
   302  		// If we don't have enough at the end, redo the search with an
   303  		// increased value n1, but only if Lookup returned all the requested
   304  		// indices in the first place (if it returned fewer than that then
   305  		// there cannot be more).
   306  		for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ {
   307  			indices := x.Lookup(lit, n1)
   308  			if len(indices) == 0 {
   309  				return
   310  			}
   311  			sort.Ints(indices)
   312  			pairs := make([]int, 2*len(indices))
   313  			result = make([][]int, len(indices))
   314  			count := 0
   315  			prev := 0
   316  			for _, i := range indices {
   317  				if count == n {
   318  					break
   319  				}
   320  				// ignore indices leading to overlapping matches
   321  				if prev <= i {
   322  					j := 2 * count
   323  					pairs[j+0] = i
   324  					pairs[j+1] = i + len(lit)
   325  					result[count] = pairs[j : j+2]
   326  					count++
   327  					prev = i + len(lit)
   328  				}
   329  			}
   330  			result = result[0:count]
   331  			if len(result) >= n || len(indices) != n1 {
   332  				// found all matches or there's no chance to find more
   333  				// (n and n1 can be negative)
   334  				break
   335  			}
   336  		}
   337  		if len(result) == 0 {
   338  			result = nil
   339  		}
   340  		return
   341  	}
   342  
   343  	// regexp has a non-empty literal prefix; Lookup(lit) computes
   344  	// the indices of possible complete matches; use these as starting
   345  	// points for anchored searches
   346  	// (regexp "^" matches beginning of input, not beginning of line)
   347  	r = regexp.MustCompile("^" + r.String()) // compiles because r compiled
   348  
   349  	// same comment about Lookup applies here as in the loop above
   350  	for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ {
   351  		indices := x.Lookup(lit, n1)
   352  		if len(indices) == 0 {
   353  			return
   354  		}
   355  		sort.Ints(indices)
   356  		result = result[0:0]
   357  		prev := 0
   358  		for _, i := range indices {
   359  			if len(result) == n {
   360  				break
   361  			}
   362  			m := r.FindIndex(x.data[i:]) // anchored search - will not run off
   363  			// ignore indices leading to overlapping matches
   364  			if m != nil && prev <= i {
   365  				m[0] = i // correct m
   366  				m[1] += i
   367  				result = append(result, m)
   368  				prev = m[1]
   369  			}
   370  		}
   371  		if len(result) >= n || len(indices) != n1 {
   372  			// found all matches or there's no chance to find more
   373  			// (n and n1 can be negative)
   374  			break
   375  		}
   376  	}
   377  	if len(result) == 0 {
   378  		result = nil
   379  	}
   380  	return
   381  }
   382
View as plain text