/*******************************************************************************
 * Copyright 2010 Cees De Groot, Alex Boisvert, Jan Kotek
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package org.apache.jdbm;

import java.io.*;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;

/**
 * B+Tree persistent indexing data structure. B+Trees are optimized for
 * block-based, random I/O storage because they store multiple keys on
 * one tree node (called {@code BTreeNode}). In addition, the leaf nodes
 * directly contain (inline) small values associated with the keys, allowing a
 * single (or sequential) disk read of all the values on the node.
 * <p>
 * B+Trees are n-ary, yielding log(N) search cost. They are self-balancing,
 * preventing search performance degradation when the size of the tree grows.
 * <p>
 * BTree stores its keys sorted. By default JDBM expects keys to implement the
 * {@code Comparable} interface, but the user may supply their own
 * {@code Comparator} at BTree creation time. The comparator is serialized and
 * stored as part of the BTree.
 * <p>
 * The B+Tree allows traversing the keys in forward and reverse order using a
 * tuple browser obtained from the {@code browse()} methods. However, it is
 * better to use the BTreeMap wrapper, which implements the {@code SortedMap}
 * interface.
 * <p>
 * This implementation does not directly support duplicate keys. It is
 * possible to handle duplicates by grouping values using an ArrayList as value.
 * This scenario is supported by JDBM serialization, so there is no big
 * performance penalty.
 * <p>
 * There is no limit on key size or value size, but it is recommended to keep
 * keys as small as possible to reduce disk I/O. If a serialized value exceeds
 * 32 bytes, it is stored in a separate record and the tree contains only a
 * recid reference to it. BTree uses delta compression for its keys.
 *
 * @param <K> type of the keys stored in the tree
 * @param <V> type of the values stored in the tree
 * @author Alex Boisvert
 * @author Jan Kotek
 */
class BTree<K, V> {

    private static final boolean DEBUG = false;

    /**
     * Default node size (number of entries per node)
     */
    public static final int DEFAULT_SIZE = 32; //TODO test optimal size, it has serious impact on sequential write and read

    /**
     * Record manager used to persist changes in BTreeNodes
     */
    protected transient DBAbstract _db;

    /**
     * This BTree's record ID in the DB.
     */
    private transient long _recid;

    /**
     * Comparator used to index entries (optional)
     */
    protected Comparator<K> _comparator;

    /**
     * Serializer used to serialize index keys (optional)
     */
    protected Serializer<K> keySerializer;

    /**
     * Serializer used to serialize index values (optional)
     */
    protected Serializer<V> valueSerializer;

    /**
     * Indicates if values should be loaded during deserialization, set to false during defragmentation
     */
    boolean loadValues = true;

    /**
     * If false the map contains only keys; used for sets
     */
    boolean hasValues = true;

    /**
     * The number of structural modifications to the tree for fail fast iterators.
     * This value is just for runtime, it is not persisted
     */
    transient int modCount = 0;

    /**
     * Cached instance of an insert result, so we do not have to allocate a new object on each insert
     */
    protected BTreeNode.InsertResult<K, V> insertResultReuse; //TODO investigate performance impact of removing this

    /**
     * @return the serializer used for keys, or null if none was supplied
     */
    public Serializer<K> getKeySerializer() {
        return keySerializer;
    }

    /**
     * @return the serializer used for values, or null if none was supplied
     */
    public Serializer<V> getValueSerializer() {
        return valueSerializer;
    }

    /**
     * Height of the B+Tree. This is the number of BTreeNodes you have to traverse
     * to get to a leaf BTreeNode, starting from the root.
     */
    private int _height;

    /**
     * Recid of the root BTreeNode; 0 means the tree has no root (it is empty)
     */
    private transient long _root;

    /**
     * Total number of entries in the BTree
     */
    protected volatile long _entries;

    /**
     * Serializer used for BTreeNodes of this tree
     */
    private transient BTreeNode<K, V> _nodeSerializer = new BTreeNode<K, V>();

    {
        _nodeSerializer._btree = this;
    }

    /**
     * Listeners which are notified about changes in records
     */
    protected RecordListener[] recordListeners = new RecordListener[0];

    /**
     * Guards tree reads (read lock) and modifications (write lock)
     */
    final protected ReadWriteLock lock = new ReentrantReadWriteLock();

    /**
     * No-argument constructor used by serialization.
     */
    public BTree() {
        // empty
    }

    /**
     * Create a new persistent BTree with no comparator, no custom serializers
     * and value storage enabled.
     *
     * @param db DB used to store the persistent btree
     * @return the newly created and already persisted BTree
     * @throws IOException if the tree record cannot be inserted
     */
    @SuppressWarnings("unchecked")
    public static <K extends Comparable, V> BTree<K, V> createInstance(DBAbstract db)
            throws IOException {
        return BTree.<K, V>createInstance(db, null, null, null, true);
    }

    /**
     * Create a new persistent BTree
     *
     * @param db              DB used to store the persistent btree, must not be null
     * @param comparator      comparator used to order keys (optional)
     * @param keySerializer   serializer used for keys (optional)
     * @param valueSerializer serializer used for values (optional)
     * @param hasValues       false if the tree holds only keys
     * @return the newly created and already persisted BTree
     * @throws IllegalArgumentException if {@code db} is null
     * @throws IOException              if the tree record cannot be inserted
     */
    @SuppressWarnings("unchecked")
    public static <K, V> BTree<K, V> createInstance(DBAbstract db,
                                                    Comparator<K> comparator,
                                                    Serializer<K> keySerializer,
                                                    Serializer<V> valueSerializer,
                                                    boolean hasValues)
            throws IOException {
        if (db == null) {
            throw new IllegalArgumentException("Argument 'db' is null");
        }

        BTree<K, V> btree = new BTree<K, V>();
        btree._db = db;
        btree._comparator = comparator;
        btree.keySerializer = keySerializer;
        btree.valueSerializer = valueSerializer;
        btree.hasValues = hasValues;
        btree._recid = db.insert(btree, btree.getRecordManager().defaultSerializer(), false);
        return btree;
    }

    /**
     * Load a persistent BTree.
     *
     * @param db    DB used to store the persistent btree
     * @param recid Record id of the BTree
     * @return the BTree fetched from {@code db}, re-attached to it
     */
    @SuppressWarnings("unchecked")
    public static <K, V> BTree<K, V> load(DBAbstract db, long recid)
            throws IOException {
        BTree<K, V> btree = (BTree<K, V>) db.fetch(recid);
        // transient state is not restored by deserialization, so rebuild it here
        btree._recid = recid;
        btree._db = db;
        btree._nodeSerializer = new BTreeNode<K, V>();
        btree._nodeSerializer._btree = btree;
        return btree;
    }

    /**
     * Get the {@link ReadWriteLock} associated with this BTree.
     * This should be used with browsing operations to ensure
     * consistency.
     *
     * @return the lock that this tree's read and write operations acquire
     */
    public ReadWriteLock getLock() {
        return lock;
    }

    /**
     * Insert an entry in the BTree.
     * <p>
     * The BTree cannot store duplicate entries. An existing entry can be
     * replaced using the {@code replace} flag. If an entry with the
     * same key already exists in the BTree, its value is returned.
     *
     * @param key     Insert key
     * @param value   Insert value
     * @param replace Set to true to replace an existing key-value pair.
     * @return Existing value, if any.
     * @throws IllegalArgumentException if {@code key} or {@code value} is null
     */
    @SuppressWarnings("unchecked")
    public V insert(final K key, final V value, final boolean replace)
            throws IOException {
        if (key == null) {
            throw new IllegalArgumentException("Argument 'key' is null");
        }
        if (value == null) {
            throw new IllegalArgumentException("Argument 'value' is null");
        }

        // acquire before the try block so that a failed acquire never reaches unlock()
        lock.writeLock().lock();
        try {
            BTreeNode<K, V> rootNode = getRoot();

            if (rootNode == null) {
                // BTree is currently empty, create a new root BTreeNode
                if (DEBUG) {
                    System.out.println("BTree.insert() new root BTreeNode");
                }
                rootNode = new BTreeNode<K, V>(this, key, value);
                _root = rootNode._recid;
                _height = 1;
                _entries = 1;
                _db.update(_recid, this);
                modCount++;
                // notify listeners
                for (RecordListener l : recordListeners) {
                    l.recordInserted(key, value);
                }
                return null;
            }

            BTreeNode.InsertResult<K, V> insert = rootNode.insert(_height, key, value, replace);
            boolean dirty = false;
            if (insert._overflow != null) {
                // current root node overflowed, we replace with a new root node
                if (DEBUG) {
                    System.out.println("BTreeNode.insert() replace root BTreeNode due to overflow");
                }
                rootNode = new BTreeNode<K, V>(this, rootNode, insert._overflow);
                _root = rootNode._recid;
                _height += 1;
                dirty = true;
            }
            if (insert._existing == null) {
                _entries++;
                modCount++;
                dirty = true;
            }
            if (dirty) {
                _db.update(_recid, this);
            }
            // notify listeners
            for (RecordListener l : recordListeners) {
                if (insert._existing == null) {
                    l.recordInserted(key, value);
                } else {
                    l.recordUpdated(key, insert._existing, value);
                }
            }

            // insert might have returned an existing value
            V ret = insert._existing;
            // zero out tuple and put it for reuse
            insert._existing = null;
            insert._overflow = null;
            this.insertResultReuse = insert;
            return ret;
        } finally {
            lock.writeLock().unlock();
        }
    }

    /**
     * Remove an entry with the given key from the BTree.
     *
     * @param key Removal key
     * @return Value associated with the key, or null if no entry with given
     *         key existed in the BTree.
     * @throws IllegalArgumentException if {@code key} is null
     */
    @SuppressWarnings("unchecked")
    public V remove(K key) throws IOException {
        if (key == null) {
            throw new IllegalArgumentException("Argument 'key' is null");
        }

        lock.writeLock().lock();
        try {
            BTreeNode<K, V> rootNode = getRoot();
            if (rootNode == null) {
                return null;
            }

            boolean dirty = false;
            BTreeNode.RemoveResult<K, V> remove = rootNode.remove(_height, key);
            if (remove._underflow && rootNode.isEmpty()) {
                // the root became empty: drop it and shrink the tree by one level
                _height -= 1;
                dirty = true;

                _db.delete(_root);
                if (_height == 0) {
                    _root = 0;
                } else {
                    _root = rootNode.loadLastChildNode()._recid;
                }
            }
            if (remove._value != null) {
                _entries--;
                modCount++;
                dirty = true;
            }
            if (dirty) {
                _db.update(_recid, this);
            }
            if (remove._value != null) {
                for (RecordListener l : recordListeners) {
                    l.recordRemoved(key, remove._value);
                }
            }
            return remove._value;
        } finally {
            lock.writeLock().unlock();
        }
    }

    /**
     * Find the value associated with the given key.
     *
     * @param key Lookup key.
     * @return Value associated with the key, or null if not found.
     * @throws IllegalArgumentException if {@code key} is null
     */
    public V get(K key) throws IOException {
        if (key == null) {
            throw new IllegalArgumentException("Argument 'key' is null");
        }

        lock.readLock().lock();
        try {
            BTreeNode<K, V> rootNode = getRoot();
            if (rootNode == null) {
                return null;
            }
            return rootNode.findValue(_height, key);
        } finally {
            lock.readLock().unlock();
        }
    }

    /**
     * Find the value associated with the given key, or the entry immediately
     * following this key in the ordered BTree.
     *
     * @param key Lookup key.
     * @return Value associated with the key, or a greater entry, or null if no
     *         greater entry was found.
     */
    public BTreeTuple<K, V> findGreaterOrEqual(K key) throws IOException {
        if (key == null) {
            // there can't be a key greater than or equal to "null"
            // because null is considered an infinite key.
            return null;
        }

        BTreeTuple<K, V> tuple = new BTreeTuple<K, V>(null, null);
        BTreeTupleBrowser<K, V> browser = browse(key, true);
        return browser.getNext(tuple) ? tuple : null;
    }

    /**
     * Get a browser initially positioned at the beginning of the BTree.
     * <p>
     * <b>WARNING: If you make structural modifications to the BTree during
     * browsing, you will get inconsistent browsing results.</b>
     *
     * @return Browser positioned at the beginning of the BTree.
     */
    @SuppressWarnings("unchecked")
    public BTreeTupleBrowser<K, V> browse() throws IOException {
        lock.readLock().lock();
        try {
            BTreeNode<K, V> rootNode = getRoot();
            if (rootNode == null) {
                return EMPTY_BROWSER;
            }
            return rootNode.findFirst();
        } finally {
            lock.readLock().unlock();
        }
    }

    /**
     * Get a browser initially positioned just before the given key.
     * <p>
     * <b>WARNING: If you make structural modifications to the BTree during
     * browsing, you will get inconsistent browsing results.</b>
     *
     * @param key       Key used to position the browser. If null, the browser
     *                  will be positioned after the last entry of the BTree.
     *                  (Null is considered to be an "infinite" key)
     * @param inclusive passed through to the node lookup together with {@code key}
     * @return Browser positioned just before the given key.
     */
    @SuppressWarnings("unchecked")
    public BTreeTupleBrowser<K, V> browse(final K key, final boolean inclusive) throws IOException {
        lock.readLock().lock();
        try {
            BTreeNode<K, V> rootNode = getRoot();
            if (rootNode == null) {
                return EMPTY_BROWSER;
            }
            return rootNode.find(_height, key, inclusive);
        } finally {
            lock.readLock().unlock();
        }
    }

    /**
     * Return the persistent record identifier of the BTree.
     */
    public long getRecid() {
        return _recid;
    }

    /**
     * Return the root BTreeNode, or null if it doesn't exist.
     */
    BTreeNode<K, V> getRoot() throws IOException {
        if (_root == 0) {
            return null;
        }
        BTreeNode<K, V> root = _db.fetch(_root, _nodeSerializer);
        if (root != null) {
            root._recid = _root;
            root._btree = this;
        }
        return root;
    }

    /**
     * Reads a BTree header in the field order produced by
     * {@link #writeExternal(DataOutput)}.
     *
     * @param in  input positioned at the start of a serialized BTree
     * @param ser serialization whose {@code db} becomes the tree's record manager
     *            and which deserializes the comparator and the serializers
     * @return the deserialized tree
     */
    @SuppressWarnings("unchecked")
    static BTree readExternal(DataInput in, Serialization ser)
            throws IOException, ClassNotFoundException {
        BTree tree = new BTree();
        tree._db = ser.db;
        tree._height = in.readInt();
        tree._recid = in.readLong();
        tree._root = in.readLong();
        tree._entries = in.readLong();
        tree.hasValues = in.readBoolean();
        tree._comparator = (Comparator) ser.deserialize(in);
        tree.keySerializer = (Serializer) ser.deserialize(in);
        tree.valueSerializer = (Serializer) ser.deserialize(in);
        return tree;
    }

    /**
     * Writes the BTree header: height, recid, root recid, entry count, the
     * hasValues flag, then the comparator and both serializers.
     *
     * @param out output to write the header to
     */
    public void writeExternal(DataOutput out) throws IOException {
        out.writeInt(_height);
        out.writeLong(_recid);
        out.writeLong(_root);
        out.writeLong(_entries);
        out.writeBoolean(hasValues);
        _db.defaultSerializer().serialize(out, _comparator);
        _db.defaultSerializer().serialize(out, keySerializer);
        _db.defaultSerializer().serialize(out, valueSerializer);
    }

    /**
     * Copies a tree from one db to another, defragmenting it along the way.
     *
     * @param recid record id of the BTree to copy
     * @param r1    store the tree is read from
     * @param r2    store the tree is written to, under the same record ids
     * @throws IOException if reading or writing a record fails
     */
    public static void defrag(long recid, DBStore r1, DBStore r2) throws IOException {
        try {
            byte[] data = r1.fetchRaw(recid);
            r2.forceInsert(recid, data);
            DataInput in = new DataInputOutput(data);
            BTree t = (BTree) r1.defaultSerializer().deserialize(in);
            t.loadValues = false;
            t._db = r1;
            t._nodeSerializer = new BTreeNode(t, false);

            BTreeNode p = t.getRoot();
            if (p != null) {
                r2.forceInsert(t._root, r1.fetchRaw(t._root));
                p.defrag(r1, r2);
            }
        } catch (ClassNotFoundException e) {
            throw new IOError(e);
        }
    }

    /**
     * Browser returning no element.
     */
    private static final BTreeTupleBrowser EMPTY_BROWSER = new BTreeTupleBrowser() {

        @Override
        public boolean getNext(BTreeTuple tuple) {
            return false;
        }

        @Override
        public boolean getPrevious(BTreeTuple tuple) {
            return false;
        }

        @Override
        public void remove(Object key) {
            throw new IndexOutOfBoundsException();
        }
    };

    /**
     * Add a RecordListener which is notified about record changes.
     * The listener array is replaced by a copy with the listener appended.
     *
     * @param listener listener to append
     */
    public void addRecordListener(RecordListener listener) {
        recordListeners = Arrays.copyOf(recordListeners, recordListeners.length + 1);
        recordListeners[recordListeners.length - 1] = listener;
    }

    /**
     * Remove a RecordListener which is notified about record changes.
     * Only the first listener equal to the argument is removed; if none
     * matches, the listener array is left untouched.
     *
     * @param listener listener to remove
     */
    public void removeRecordListener(RecordListener listener) {
        final RecordListener[] current = recordListeners;
        for (int i = 0; i < current.length; i++) {
            boolean matches = (listener == null) ? current[i] == null : listener.equals(current[i]);
            if (matches) {
                // Build an exactly-sized copy without slot i. Arrays.asList(..).remove(..)
                // cannot be used here: that list is fixed-size and rejects removal.
                RecordListener[] shrunk = new RecordListener[current.length - 1];
                System.arraycopy(current, 0, shrunk, 0, i);
                System.arraycopy(current, i + 1, shrunk, i, current.length - i - 1);
                recordListeners = shrunk;
                return;
            }
        }
    }

    /**
     * @return the DB this tree persists its records in
     */
    public DBAbstract getRecordManager() {
        return _db;
    }

    /**
     * @return the comparator used to order keys, or null if none was supplied
     */
    public Comparator<K> getComparator() {
        return _comparator;
    }

    /**
     * Deletes all BTreeNodes in this BTree and leaves it as an empty tree.
     */
    public void clear() throws IOException {
        lock.writeLock().lock();
        try {
            BTreeNode<K, V> rootNode = getRoot();
            if (rootNode != null) {
                rootNode.delete();
            }
            // Put the header back into the same "no root" state that remove() produces
            // for an emptied tree and persist it, so that getRoot() does not fetch the
            // deleted root afterwards.
            // NOTE(review): relies on BTreeNode.delete() removing the root record itself,
            // as this method's contract ("deletes all BTreeNodes") states - confirm.
            _root = 0;
            _height = 0;
            _entries = 0;
            modCount++;
            _db.update(_recid, this);
        } finally {
            lock.writeLock().unlock();
        }
    }

    /**
     * Used for debugging and testing only. Populates the 'out' list with
     * the recids of all child nodes in the BTree.
     *
     * @param out list that receives the root recid followed by the child recids
     * @throws IOException if a node cannot be loaded
     */
    void dumpChildNodeRecIDs(List<Long> out) throws IOException {
        BTreeNode<K, V> root = getRoot();
        if (root != null) {
            out.add(root._recid);
            root.dumpChildNodeRecIDs(out, _height);
        }
    }

    /**
     * @return false if this tree holds only keys
     */
    public boolean hasValues() {
        return hasValues;
    }

    /**
     * Browser to traverse a collection of tuples. The browser allows for
     * forward and reverse order traversal.
     *
     * @param <K> key type of the browsed tuples
     * @param <V> value type of the browsed tuples
     */
    static interface BTreeTupleBrowser<K, V> {

        /**
         * Get the next tuple.
         *
         * @param tuple Tuple into which values are copied.
         * @return True if values have been copied in tuple, or false if there is no next tuple.
         */
        boolean getNext(BTree.BTreeTuple<K, V> tuple) throws IOException;

        /**
         * Get the previous tuple.
         *
         * @param tuple Tuple into which values are copied.
         * @return True if values have been copied in tuple, or false if there is no previous tuple.
         */
        boolean getPrevious(BTree.BTreeTuple<K, V> tuple) throws IOException;

        /**
         * Remove an entry with given key, and increases browsers expectedModCount
         * This method is here to support 'ConcurrentModificationException' on Map interface.
         *
         * @param key key of the entry to remove
         */
        void remove(K key) throws IOException;
    }

    /**
     * Tuple consisting of a key-value pair.
     *
     * @param <K> key type
     * @param <V> value type
     */
    static final class BTreeTuple<K, V> {

        K key;

        V value;

        BTreeTuple() {
            // empty
        }

        BTreeTuple(K key, V value) {
            this.key = key;
            this.value = value;
        }
    }
}