// Copyright [2017] LinkedIn Corp. Licensed under the Apache License, Version // 2.0 (the "License"); you may not use this file except in compliance with the // License. You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. package goavro import ( "bytes" "compress/flate" "encoding/binary" "errors" "fmt" "hash/crc32" "io" "io/ioutil" "os" "github.com/golang/snappy" ) // OCFConfig is used to specify creation parameters for OCFWriter. type OCFConfig struct { // W specifies the `io.Writer` to which to send the encoded data, // (required). If W is `*os.File`, then creating an OCF for writing will // attempt to read any existing OCF header and use the schema and // compression codec specified by the existing header, then advance the file // position to the tail end of the file for appending. W io.Writer // Codec specifies the Codec to use for the new OCFWriter, (optional). If // the W parameter above is an `*os.File` which contains a Codec, the Codec // in the existing file will be used instead. Otherwise if this Codec // parameter is specified, it will be used. If neither the W parameter above // is an `*os.File` with an existing Codec, nor this Codec parameter is // specified, the OCFWriter will create a new Codec from the schema string // specified by the Schema parameter below. Codec *Codec // Schema specifies the Avro schema for the data to be encoded, (optional). // If neither the W parameter above is an `*os.File` with an existing Codec, // nor the Codec parameter above is specified, the OCFWriter will create a // new Codec from the schema string specified by this Schema parameter. Schema string // CompressionName specifies the compression codec used, (optional). If // omitted, defaults to "null" codec. When appending to an existing OCF, // this field is ignored. CompressionName string //MetaData specifies application specific meta data to be added to //the OCF file. When appending to an existing OCF, this field //is ignored MetaData map[string][]byte } // OCFWriter is used to create a new or append to an existing Avro Object // Container File (OCF). type OCFWriter struct { header *ocfHeader iow io.Writer } // NewOCFWriter returns a new OCFWriter instance that may be used for appending // binary Avro data, either by appending to an existing OCF file or creating a // new OCF file. func NewOCFWriter(config OCFConfig) (*OCFWriter, error) { var err error ocf := &OCFWriter{iow: config.W} switch config.W.(type) { case nil: return nil, errors.New("cannot create OCFWriter when W is nil") case *os.File: file := config.W.(*os.File) stat, err := file.Stat() if err != nil { return nil, fmt.Errorf("cannot create OCFWriter: %s", err) } // NOTE: When upstream provides a new file, it will already exist but // have a size of 0 bytes. if stat.Size() > 0 { // attempt to read existing OCF header if ocf.header, err = readOCFHeader(file); err != nil { return nil, fmt.Errorf("cannot create OCFWriter: %s", err) } // prepare for appending data to existing OCF if err = ocf.quickScanToTail(file); err != nil { return nil, fmt.Errorf("cannot create OCFWriter: %s", err) } return ocf, nil // happy case for appending to existing OCF } } // create new OCF header based on configuration parameters if ocf.header, err = newOCFHeader(config); err != nil { return nil, fmt.Errorf("cannot create OCFWriter: %s", err) } if err = writeOCFHeader(ocf.header, config.W); err != nil { return nil, fmt.Errorf("cannot create OCFWriter: %s", err) } return ocf, nil // another happy case for creation of new OCF } // quickScanToTail advances the stream reader to the tail end of the // file. Rather than reading each encoded block, optionally decompressing it, // and then decoding it, this method reads the block count, ignoring it, then // reads the block size, then skips ahead to the followig block. It does this // repeatedly until attempts to read the file return io.EOF. func (ocfw *OCFWriter) quickScanToTail(ior io.Reader) error { sync := make([]byte, ocfSyncLength) for { // Read and validate block count blockCount, err := longBinaryReader(ior) if err != nil { if err == io.EOF { return nil // merely end of file, rather than error } return fmt.Errorf("cannot read block count: %s", err) } if blockCount <= 0 { return fmt.Errorf("cannot read when block count is not greater than 0: %d", blockCount) } if blockCount > MaxBlockCount { return fmt.Errorf("cannot read when block count exceeds MaxBlockCount: %d > %d", blockCount, MaxBlockCount) } // Read block size blockSize, err := longBinaryReader(ior) if err != nil { return fmt.Errorf("cannot read block size: %s", err) } if blockSize <= 0 { return fmt.Errorf("cannot read when block size is not greater than 0: %d", blockSize) } if blockSize > MaxBlockSize { return fmt.Errorf("cannot read when block size exceeds MaxBlockSize: %d > %d", blockSize, MaxBlockSize) } // Advance reader to end of block if _, err = io.CopyN(ioutil.Discard, ior, blockSize); err != nil { return fmt.Errorf("cannot seek to next block: %s", err) } // Read and validate sync marker var n int if n, err = io.ReadFull(ior, sync); err != nil { return fmt.Errorf("cannot read sync marker: read %d out of %d bytes: %s", n, ocfSyncLength, err) } if !bytes.Equal(sync, ocfw.header.syncMarker[:]) { return fmt.Errorf("sync marker mismatch: %v != %v", sync, ocfw.header.syncMarker) } } } // Append appends one or more data items to an OCF file in a block. If there are // more data items in the slice than MaxBlockCount allows, the data slice will // be chunked into multiple blocks, each not having more than MaxBlockCount // items. func (ocfw *OCFWriter) Append(data interface{}) error { arrayValues, err := convertArray(data) if err != nil { return err } // Chunk data so no block has more than MaxBlockCount items. for int64(len(arrayValues)) > MaxBlockCount { if err := ocfw.appendDataIntoBlock(arrayValues[:MaxBlockCount]); err != nil { return err } arrayValues = arrayValues[MaxBlockCount:] } return ocfw.appendDataIntoBlock(arrayValues) } func (ocfw *OCFWriter) appendDataIntoBlock(data []interface{}) error { var block []byte // working buffer for encoding data values var err error // Encode and concatenate each data item into the block for _, datum := range data { if block, err = ocfw.header.codec.BinaryFromNative(block, datum); err != nil { return fmt.Errorf("cannot translate datum to binary: %v; %s", datum, err) } } switch ocfw.header.compressionID { case compressionNull: // no-op case compressionDeflate: // compress into new bytes buffer. bb := bytes.NewBuffer(make([]byte, 0, len(block))) cw, _ := flate.NewWriter(bb, flate.DefaultCompression) // writing bytes to cw will compress bytes and send to bb. if _, err := cw.Write(block); err != nil { return err } if err := cw.Close(); err != nil { return err } block = bb.Bytes() case compressionSnappy: compressed := snappy.Encode(nil, block) // OCF requires snappy to have CRC32 checksum after each snappy block compressed = append(compressed, 0, 0, 0, 0) // expand slice by 4 bytes so checksum will fit binary.BigEndian.PutUint32(compressed[len(compressed)-4:], crc32.ChecksumIEEE(block)) // checksum of decompressed block block = compressed default: return fmt.Errorf("should not get here: cannot compress block using unrecognized compression: %d", ocfw.header.compressionID) } // create file data block buf := make([]byte, 0, len(block)+ocfBlockConst) // pre-allocate block bytes buf, _ = longBinaryFromNative(buf, len(data)) // block count (number of data items) buf, _ = longBinaryFromNative(buf, len(block)) // block size (number of bytes in block) buf = append(buf, block...) // serialized objects buf = append(buf, ocfw.header.syncMarker[:]...) // sync marker _, err = ocfw.iow.Write(buf) return err } // Codec returns the codec used by OCFWriter. This function provided because // upstream may be appending to existing OCF which uses a different schema than // requested during instantiation. func (ocfw *OCFWriter) Codec() *Codec { return ocfw.header.codec } // CompressionName returns the name of the compression algorithm used by // OCFWriter. This function provided because upstream may be appending to // existing OCF which uses a different compression algorithm than requested // during instantiation. the OCF file. func (ocfw *OCFWriter) CompressionName() string { switch ocfw.header.compressionID { case compressionNull: return CompressionNullLabel case compressionDeflate: return CompressionDeflateLabel case compressionSnappy: return CompressionSnappyLabel default: return "should not get here: unrecognized compression algorithm" } }