Version 1.1.0.0 update

This commit is contained in:
transcoder
2014-02-12 04:09:36 -07:00
parent 58decdbca7
commit f03acc635f
481 changed files with 156398 additions and 57772 deletions

View File

@@ -3,6 +3,7 @@
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#include "addrman.h"
#include "hash.h"
using namespace std;
@@ -187,7 +188,7 @@ int CAddrMan::ShrinkNew(int nUBucket)
}
assert(mapInfo.count(nOldest) == 1);
CAddrInfo &info = mapInfo[nOldest];
if (--info.nRefCount == 0)
if (--info.nRefCount == 0)
{
SwapRandom(info.nRandomPos, vRandom.size()-1);
vRandom.pop_back();
@@ -241,7 +242,7 @@ void CAddrMan::MakeTried(CAddrInfo& info, int nId, int nOrigin)
infoOld.nRefCount = 1;
// do not update nTried, as we are going to move something else there immediately
// check whether there is place in that one,
// check whether there is place in that one,
if (vNew.size() < ADDRMAN_NEW_BUCKET_SIZE)
{
// if so, move it back there
@@ -410,7 +411,7 @@ CAddress CAddrMan::Select_(int nUnkBias)
fChanceFactor *= 1.2;
}
} else {
// use an new node
// use a new node
double fChanceFactor = 1.0;
while(1)
{

View File

@@ -117,7 +117,7 @@ public:
// * Bucket selection is based on cryptographic hashing, using a randomly-generated 256-bit key, which should not
// be observable by adversaries.
// * Several indexes are kept for high performance. Defining DEBUG_ADDRMAN will introduce frequent (and expensive)
// consistency checks for the entire datastructure.
// consistency checks for the entire data structure.
// total number of buckets for tried addresses
#define ADDRMAN_TRIED_BUCKET_COUNT 64
@@ -174,13 +174,13 @@ private:
// last used nId
int nIdCount;
// table with information about all nId's
// table with information about all nIds
std::map<int, CAddrInfo> mapInfo;
// find an nId based on its network address
std::map<CNetAddr, int> mapAddr;
// randomly-ordered vector of all nId's
// randomly-ordered vector of all nIds
std::vector<int> vRandom;
// number of "tried" entries
@@ -214,7 +214,7 @@ protected:
// This is the only place where actual deletes occur.
// They are never deleted while in the "tried" table, only possibly evicted back to the "new" table.
int ShrinkNew(int nUBucket);
// Move an entry from the "new" table(s) to the "tried" table
// @pre vvUnkown[nOrigin].count(nId) != 0
void MakeTried(CAddrInfo& info, int nId, int nOrigin);
@@ -253,8 +253,8 @@ public:
// * nNew
// * nTried
// * number of "new" buckets
// * all nNew addrinfo's in vvNew
// * all nTried addrinfo's in vvTried
// * all nNew addrinfos in vvNew
// * all nTried addrinfos in vvTried
// * for each bucket:
// * number of elements
// * for each element: index

258
src/alert.cpp Normal file
View File

@@ -0,0 +1,258 @@
//
// Alert system
//
#include <algorithm>
#include <boost/algorithm/string/classification.hpp>
#include <boost/algorithm/string/replace.hpp>
#include <boost/foreach.hpp>
#include <map>
#include "alert.h"
#include "key.h"
#include "net.h"
#include "sync.h"
#include "ui_interface.h"
using namespace std;
map<uint256, CAlert> mapAlerts;
CCriticalSection cs_mapAlerts;
static const char* pszMainKey = "040184710fa689ad5023690c80f3a49c8f13f8d45b8c857fbcbc8bc4a8e4d3eb4b10f4d4604fa08dce601aaf0f470216fe1b51850b4acf21b179c45070ac7b03a9";
static const char* pszTestKey = "04302390343f91cc401d56d68b123028bf52e5fca1939df127f63c6467cdf9c8e2c14b61104cf817d0b780da337893ecc4aaff1309e536162dabbdb45200ca2b0a";
void CUnsignedAlert::SetNull()
{
nVersion = 1;
nRelayUntil = 0;
nExpiration = 0;
nID = 0;
nCancel = 0;
setCancel.clear();
nMinVer = 0;
nMaxVer = 0;
setSubVer.clear();
nPriority = 0;
strComment.clear();
strStatusBar.clear();
strReserved.clear();
}
std::string CUnsignedAlert::ToString() const
{
std::string strSetCancel;
BOOST_FOREACH(int n, setCancel)
strSetCancel += strprintf("%d ", n);
std::string strSetSubVer;
BOOST_FOREACH(std::string str, setSubVer)
strSetSubVer += "\"" + str + "\" ";
return strprintf(
"CAlert(\n"
" nVersion = %d\n"
" nRelayUntil = %"PRI64d"\n"
" nExpiration = %"PRI64d"\n"
" nID = %d\n"
" nCancel = %d\n"
" setCancel = %s\n"
" nMinVer = %d\n"
" nMaxVer = %d\n"
" setSubVer = %s\n"
" nPriority = %d\n"
" strComment = \"%s\"\n"
" strStatusBar = \"%s\"\n"
")\n",
nVersion,
nRelayUntil,
nExpiration,
nID,
nCancel,
strSetCancel.c_str(),
nMinVer,
nMaxVer,
strSetSubVer.c_str(),
nPriority,
strComment.c_str(),
strStatusBar.c_str());
}
void CUnsignedAlert::print() const
{
printf("%s", ToString().c_str());
}
void CAlert::SetNull()
{
CUnsignedAlert::SetNull();
vchMsg.clear();
vchSig.clear();
}
bool CAlert::IsNull() const
{
return (nExpiration == 0);
}
uint256 CAlert::GetHash() const
{
return Hash(this->vchMsg.begin(), this->vchMsg.end());
}
bool CAlert::IsInEffect() const
{
return (GetAdjustedTime() < nExpiration);
}
bool CAlert::Cancels(const CAlert& alert) const
{
if (!IsInEffect())
return false; // this was a no-op before 31403
return (alert.nID <= nCancel || setCancel.count(alert.nID));
}
bool CAlert::AppliesTo(int nVersion, std::string strSubVerIn) const
{
// TODO: rework for client-version-embedded-in-strSubVer ?
return (IsInEffect() &&
nMinVer <= nVersion && nVersion <= nMaxVer &&
(setSubVer.empty() || setSubVer.count(strSubVerIn)));
}
bool CAlert::AppliesToMe() const
{
return AppliesTo(PROTOCOL_VERSION, FormatSubVersion(CLIENT_NAME, CLIENT_VERSION, std::vector<std::string>()));
}
bool CAlert::RelayTo(CNode* pnode) const
{
if (!IsInEffect())
return false;
// returns true if wasn't already contained in the set
if (pnode->setKnown.insert(GetHash()).second)
{
if (AppliesTo(pnode->nVersion, pnode->strSubVer) ||
AppliesToMe() ||
GetAdjustedTime() < nRelayUntil)
{
pnode->PushMessage("alert", *this);
return true;
}
}
return false;
}
bool CAlert::CheckSignature() const
{
CPubKey key(ParseHex(fTestNet ? pszTestKey : pszMainKey));
if (!key.Verify(Hash(vchMsg.begin(), vchMsg.end()), vchSig))
return error("CAlert::CheckSignature() : verify signature failed");
// Now unserialize the data
CDataStream sMsg(vchMsg, SER_NETWORK, PROTOCOL_VERSION);
sMsg >> *(CUnsignedAlert*)this;
return true;
}
CAlert CAlert::getAlertByHash(const uint256 &hash)
{
CAlert retval;
{
LOCK(cs_mapAlerts);
map<uint256, CAlert>::iterator mi = mapAlerts.find(hash);
if(mi != mapAlerts.end())
retval = mi->second;
}
return retval;
}
bool CAlert::ProcessAlert(bool fThread)
{
if (!CheckSignature())
return false;
if (!IsInEffect())
return false;
// alert.nID=max is reserved for if the alert key is
// compromised. It must have a pre-defined message,
// must never expire, must apply to all versions,
// and must cancel all previous
// alerts or it will be ignored (so an attacker can't
// send an "everything is OK, don't panic" version that
// cannot be overridden):
int maxInt = std::numeric_limits<int>::max();
if (nID == maxInt)
{
if (!(
nExpiration == maxInt &&
nCancel == (maxInt-1) &&
nMinVer == 0 &&
nMaxVer == maxInt &&
setSubVer.empty() &&
nPriority == maxInt &&
strStatusBar == "URGENT: Alert key compromised, upgrade required"
))
return false;
}
{
LOCK(cs_mapAlerts);
// Cancel previous alerts
for (map<uint256, CAlert>::iterator mi = mapAlerts.begin(); mi != mapAlerts.end();)
{
const CAlert& alert = (*mi).second;
if (Cancels(alert))
{
printf("cancelling alert %d\n", alert.nID);
uiInterface.NotifyAlertChanged((*mi).first, CT_DELETED);
mapAlerts.erase(mi++);
}
else if (!alert.IsInEffect())
{
printf("expiring alert %d\n", alert.nID);
uiInterface.NotifyAlertChanged((*mi).first, CT_DELETED);
mapAlerts.erase(mi++);
}
else
mi++;
}
// Check if this alert has been cancelled
BOOST_FOREACH(PAIRTYPE(const uint256, CAlert)& item, mapAlerts)
{
const CAlert& alert = item.second;
if (alert.Cancels(*this))
{
printf("alert already cancelled by %d\n", alert.nID);
return false;
}
}
// Add to mapAlerts
mapAlerts.insert(make_pair(GetHash(), *this));
// Notify UI and -alertnotify if it applies to me
if(AppliesToMe())
{
uiInterface.NotifyAlertChanged(GetHash(), CT_NEW);
std::string strCmd = GetArg("-alertnotify", "");
if (!strCmd.empty())
{
// Alert text should be plain ascii coming from a trusted source, but to
// be safe we first strip anything not in safeChars, then add single quotes around
// the whole string before passing it to the shell:
std::string singleQuote("'");
std::string safeStatus = SanitizeString(strStatusBar);
safeStatus = singleQuote+safeStatus+singleQuote;
boost::replace_all(strCmd, "%s", safeStatus);
if (fThread)
boost::thread t(runCommand, strCmd); // thread runs free
else
runCommand(strCmd);
}
}
}
printf("accepted alert %d, AppliesToMe()=%d\n", nID, AppliesToMe());
return true;
}

102
src/alert.h Normal file
View File

@@ -0,0 +1,102 @@
// Copyright (c) 2010 Satoshi Nakamoto
// Copyright (c) 2009-2012 The Bitcoin developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef _BITCOINALERT_H_
#define _BITCOINALERT_H_ 1
#include <set>
#include <string>
#include "uint256.h"
#include "util.h"
class CNode;
/** Alerts are for notifying old versions if they become too obsolete and
* need to upgrade. The message is displayed in the status bar.
* Alert messages are broadcast as a vector of signed data. Unserializing may
* not read the entire buffer if the alert is for a newer version, but older
* versions can still relay the original data.
*/
class CUnsignedAlert
{
public:
int nVersion;
int64 nRelayUntil; // when newer nodes stop relaying to newer nodes
int64 nExpiration;
int nID;
int nCancel;
std::set<int> setCancel;
int nMinVer; // lowest version inclusive
int nMaxVer; // highest version inclusive
std::set<std::string> setSubVer; // empty matches all
int nPriority;
// Actions
std::string strComment;
std::string strStatusBar;
std::string strReserved;
IMPLEMENT_SERIALIZE
(
READWRITE(this->nVersion);
nVersion = this->nVersion;
READWRITE(nRelayUntil);
READWRITE(nExpiration);
READWRITE(nID);
READWRITE(nCancel);
READWRITE(setCancel);
READWRITE(nMinVer);
READWRITE(nMaxVer);
READWRITE(setSubVer);
READWRITE(nPriority);
READWRITE(strComment);
READWRITE(strStatusBar);
READWRITE(strReserved);
)
void SetNull();
std::string ToString() const;
void print() const;
};
/** An alert is a combination of a serialized CUnsignedAlert and a signature. */
class CAlert : public CUnsignedAlert
{
public:
std::vector<unsigned char> vchMsg;
std::vector<unsigned char> vchSig;
CAlert()
{
SetNull();
}
IMPLEMENT_SERIALIZE
(
READWRITE(vchMsg);
READWRITE(vchSig);
)
void SetNull();
bool IsNull() const;
uint256 GetHash() const;
bool IsInEffect() const;
bool Cancels(const CAlert& alert) const;
bool AppliesTo(int nVersion, std::string strSubVerIn) const;
bool AppliesToMe() const;
bool RelayTo(CNode* pnode) const;
bool CheckSignature() const;
bool ProcessAlert(bool fThread = true);
/*
* Get copy of (active) alert object by hash. Returns a null alert if it is not found.
*/
static CAlert getAlertByHash(const uint256 &hash);
};
#endif

View File

@@ -1,7 +1,5 @@
// Copyright (c) 2009-2010 Satoshi Nakamoto
// Copyright (c) 2009-2012 The Bitcoin developers
// Copyright (c) 2011-2012 Litecoin Developers
// Copyright (c) 2013 CasinoCoin Developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef BITCOIN_ALLOCATORS_H
@@ -9,6 +7,9 @@
#include <string.h>
#include <string>
#include <boost/thread/mutex.hpp>
#include <map>
#include <openssl/crypto.h> // for OPENSSL_cleanse()
#ifdef WIN32
#ifdef _WIN32_WINNT
@@ -24,23 +25,169 @@
// Note that VirtualLock does not provide this as a guarantee on Windows,
// but, in practice, memory that has been VirtualLock'd almost never gets written to
// the pagefile except in rare circumstances where memory is extremely low.
#define mlock(p, n) VirtualLock((p), (n));
#define munlock(p, n) VirtualUnlock((p), (n));
#else
#include <sys/mman.h>
#include <limits.h>
/* This comes from limits.h if it's not defined there set a sane default */
#ifndef PAGESIZE
#include <unistd.h>
#define PAGESIZE sysconf(_SC_PAGESIZE)
#include <limits.h> // for PAGESIZE
#include <unistd.h> // for sysconf
#endif
#define mlock(a,b) \
mlock(((void *)(((size_t)(a)) & (~((PAGESIZE)-1)))),\
(((((size_t)(a)) + (b) - 1) | ((PAGESIZE) - 1)) + 1) - (((size_t)(a)) & (~((PAGESIZE) - 1))))
#define munlock(a,b) \
munlock(((void *)(((size_t)(a)) & (~((PAGESIZE)-1)))),\
(((((size_t)(a)) + (b) - 1) | ((PAGESIZE) - 1)) + 1) - (((size_t)(a)) & (~((PAGESIZE) - 1))))
/**
* Thread-safe class to keep track of locked (ie, non-swappable) memory pages.
*
* Memory locks do not stack, that is, pages which have been locked several times by calls to mlock()
* will be unlocked by a single call to munlock(). This can result in keying material ending up in swap when
* those functions are used naively. This class simulates stacking memory locks by keeping a counter per page.
*
* @note By using a map from each page base address to lock count, this class is optimized for
* small objects that span up to a few pages, mostly smaller than a page. To support large allocations,
* something like an interval tree would be the preferred data structure.
*/
template <class Locker> class LockedPageManagerBase
{
public:
LockedPageManagerBase(size_t page_size):
page_size(page_size)
{
// Determine bitmask for extracting page from address
assert(!(page_size & (page_size-1))); // size must be power of two
page_mask = ~(page_size - 1);
}
// For all pages in affected range, increase lock count
void LockRange(void *p, size_t size)
{
boost::mutex::scoped_lock lock(mutex);
if(!size) return;
const size_t base_addr = reinterpret_cast<size_t>(p);
const size_t start_page = base_addr & page_mask;
const size_t end_page = (base_addr + size - 1) & page_mask;
for(size_t page = start_page; page <= end_page; page += page_size)
{
Histogram::iterator it = histogram.find(page);
if(it == histogram.end()) // Newly locked page
{
locker.Lock(reinterpret_cast<void*>(page), page_size);
histogram.insert(std::make_pair(page, 1));
}
else // Page was already locked; increase counter
{
it->second += 1;
}
}
}
// For all pages in affected range, decrease lock count
void UnlockRange(void *p, size_t size)
{
boost::mutex::scoped_lock lock(mutex);
if(!size) return;
const size_t base_addr = reinterpret_cast<size_t>(p);
const size_t start_page = base_addr & page_mask;
const size_t end_page = (base_addr + size - 1) & page_mask;
for(size_t page = start_page; page <= end_page; page += page_size)
{
Histogram::iterator it = histogram.find(page);
assert(it != histogram.end()); // Cannot unlock an area that was not locked
// Decrease counter for page, when it is zero, the page will be unlocked
it->second -= 1;
if(it->second == 0) // Nothing on the page anymore that keeps it locked
{
// Unlock page and remove the count from histogram
locker.Unlock(reinterpret_cast<void*>(page), page_size);
histogram.erase(it);
}
}
}
// Get number of locked pages for diagnostics
int GetLockedPageCount()
{
boost::mutex::scoped_lock lock(mutex);
return histogram.size();
}
private:
Locker locker;
boost::mutex mutex;
size_t page_size, page_mask;
// map of page base address to lock count
typedef std::map<size_t,int> Histogram;
Histogram histogram;
};
/** Determine system page size in bytes */
static inline size_t GetSystemPageSize()
{
size_t page_size;
#if defined(WIN32)
SYSTEM_INFO sSysInfo;
GetSystemInfo(&sSysInfo);
page_size = sSysInfo.dwPageSize;
#elif defined(PAGESIZE) // defined in limits.h
page_size = PAGESIZE;
#else // assume some POSIX OS
page_size = sysconf(_SC_PAGESIZE);
#endif
return page_size;
}
/**
* OS-dependent memory page locking/unlocking.
* Defined as policy class to make stubbing for test possible.
*/
class MemoryPageLocker
{
public:
/** Lock memory pages.
* addr and len must be a multiple of the system page size
*/
bool Lock(const void *addr, size_t len)
{
#ifdef WIN32
return VirtualLock(const_cast<void*>(addr), len);
#else
return mlock(addr, len) == 0;
#endif
}
/** Unlock memory pages.
* addr and len must be a multiple of the system page size
*/
bool Unlock(const void *addr, size_t len)
{
#ifdef WIN32
return VirtualUnlock(const_cast<void*>(addr), len);
#else
return munlock(addr, len) == 0;
#endif
}
};
/**
* Singleton class to keep track of locked (ie, non-swappable) memory pages, for use in
* std::allocator templates.
*/
class LockedPageManager: public LockedPageManagerBase<MemoryPageLocker>
{
public:
static LockedPageManager instance; // instantiated in util.cpp
private:
LockedPageManager():
LockedPageManagerBase<MemoryPageLocker>(GetSystemPageSize())
{}
};
//
// Functions for directly locking/unlocking memory objects.
// Intended for non-dynamically allocated structures.
//
template<typename T> void LockObject(const T &t) {
LockedPageManager::instance.LockRange((void*)(&t), sizeof(T));
}
template<typename T> void UnlockObject(const T &t) {
OPENSSL_cleanse((void*)(&t), sizeof(T));
LockedPageManager::instance.UnlockRange((void*)(&t), sizeof(T));
}
//
// Allocator that locks its contents from being paged
@@ -71,7 +218,7 @@ struct secure_allocator : public std::allocator<T>
T *p;
p = std::allocator<T>::allocate(n, hint);
if (p != NULL)
mlock(p, sizeof(T) * n);
LockedPageManager::instance.LockRange(p, sizeof(T) * n);
return p;
}
@@ -79,8 +226,8 @@ struct secure_allocator : public std::allocator<T>
{
if (p != NULL)
{
memset(p, 0, sizeof(T) * n);
munlock(p, sizeof(T) * n);
OPENSSL_cleanse(p, sizeof(T) * n);
LockedPageManager::instance.UnlockRange(p, sizeof(T) * n);
}
std::allocator<T>::deallocate(p, n);
}
@@ -113,7 +260,7 @@ struct zero_after_free_allocator : public std::allocator<T>
void deallocate(T* p, std::size_t n)
{
if (p != NULL)
memset(p, 0, sizeof(T) * n);
OPENSSL_cleanse(p, sizeof(T) * n);
std::allocator<T>::deallocate(p, n);
}
};

View File

@@ -1,7 +1,5 @@
// Copyright (c) 2009-2010 Satoshi Nakamoto
// Copyright (c) 2009-2012 The Bitcoin Developers
// Copyright (c) 2011-2012 Litecoin Developers
// Copyright (c) 2013 CasinoCoin Developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
@@ -12,16 +10,18 @@
// could be used to create visually identical looking account numbers.
// - A string with non-alphanumeric characters is not as easily accepted as an account number.
// - E-mail usually won't line-break if there's no punctuation to break at.
// - Doubleclicking selects the whole number as one word if it's all alphanumeric.
// - Double-clicking selects the whole number as one word if it's all alphanumeric.
//
#ifndef BITCOIN_BASE58_H
#define BITCOIN_BASE58_H
#include <string>
#include <vector>
#include "bignum.h"
#include "key.h"
#include "script.h"
#include "allocators.h"
static const char* pszBase58 = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz";
@@ -180,7 +180,8 @@ protected:
unsigned char nVersion;
// the actually encoded data
std::vector<unsigned char> vchData;
typedef std::vector<unsigned char, zero_after_free_allocator<unsigned char> > vector_uchar;
vector_uchar vchData;
CBase58Data()
{
@@ -188,13 +189,6 @@ protected:
vchData.clear();
}
~CBase58Data()
{
// zero the memory, as it may contain sensitive data
if (!vchData.empty())
memset(&vchData[0], 0, vchData.size());
}
void SetData(int nVersionIn, const void* pdata, size_t nSize)
{
nVersion = nVersionIn;
@@ -223,7 +217,7 @@ public:
vchData.resize(vchTemp.size() - 1);
if (!vchData.empty())
memcpy(&vchData[0], &vchTemp[1], vchData.size());
memset(&vchTemp[0], 0, vchTemp.size());
OPENSSL_cleanse(&vchTemp[0], vchData.size());
return true;
}
@@ -410,21 +404,19 @@ public:
PRIVKEY_ADDRESS_TEST = CBitcoinAddress::PUBKEY_ADDRESS_TEST + 128,
};
void SetSecret(const CSecret& vchSecret, bool fCompressed)
{
assert(vchSecret.size() == 32);
SetData(fTestNet ? PRIVKEY_ADDRESS_TEST : PRIVKEY_ADDRESS, &vchSecret[0], vchSecret.size());
if (fCompressed)
void SetKey(const CKey& vchSecret)
{
assert(vchSecret.IsValid());
SetData(fTestNet ? PRIVKEY_ADDRESS_TEST : PRIVKEY_ADDRESS, vchSecret.begin(), vchSecret.size());
if (vchSecret.IsCompressed())
vchData.push_back(1);
}
CSecret GetSecret(bool &fCompressedOut)
CKey GetKey()
{
CSecret vchSecret;
vchSecret.resize(32);
memcpy(&vchSecret[0], &vchData[0], 32);
fCompressedOut = vchData.size() == 33;
return vchSecret;
CKey ret;
ret.Set(&vchData[0], &vchData[32], vchData.size() > 32 && vchData[32] == 1);
return ret;
}
bool IsValid() const
@@ -455,9 +447,9 @@ public:
return SetString(strSecret.c_str());
}
CBitcoinSecret(const CSecret& vchSecret, bool fCompressed)
CBitcoinSecret(const CKey& vchSecret)
{
SetSecret(vchSecret, fCompressed);
SetKey(vchSecret);
}
CBitcoinSecret()
@@ -465,4 +457,4 @@ public:
}
};
#endif
#endif // BITCOIN_BASE58_H

View File

@@ -1,7 +1,5 @@
// Copyright (c) 2009-2010 Satoshi Nakamoto
// Copyright (c) 2009-2012 The Bitcoin developers
// Copyright (c) 2011-2012 Litecoin Developers
// Copyright (c) 2013 CasinoCoin Developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef BITCOIN_BIGNUM_H
@@ -133,7 +131,9 @@ public:
if (sn < (int64)0)
{
// Since the minimum signed integer cannot be represented as positive so long as its type is signed, and it's not well-defined what happens if you make it unsigned before negating it, we instead increment the negative integer by 1, convert it, then increment the (now positive) unsigned integer by 1 to compensate
// Since the minimum signed integer cannot be represented as positive so long as its type is signed,
// and it's not well-defined what happens if you make it unsigned before negating it,
// we instead increment the negative integer by 1, convert it, then increment the (now positive) unsigned integer by 1 to compensate
n = -(sn + 1);
++n;
fNegative = true;
@@ -222,7 +222,7 @@ public:
BN_mpi2bn(pch, p - pch, this);
}
uint256 getuint256()
uint256 getuint256() const
{
unsigned int nSize = BN_bn2mpi(this, NULL);
if (nSize < 4)
@@ -264,28 +264,68 @@ public:
return vch;
}
// The "compact" format is a representation of a whole
// number N using an unsigned 32bit number similar to a
// floating point format.
// The most significant 8 bits are the unsigned exponent of base 256.
// This exponent can be thought of as "number of bytes of N".
// The lower 23 bits are the mantissa.
// Bit number 24 (0x800000) represents the sign of N.
// N = (-1^sign) * mantissa * 256^(exponent-3)
//
// Satoshi's original implementation used BN_bn2mpi() and BN_mpi2bn().
// MPI uses the most significant bit of the first byte as sign.
// Thus 0x1234560000 is compact (0x05123456)
// and 0xc0de000000 is compact (0x0600c0de)
// (0x05c0de00) would be -0x40de000000
//
// Bitcoin only uses this "compact" format for encoding difficulty
// targets, which are unsigned 256bit quantities. Thus, all the
// complexities of the sign bit and using base 256 are probably an
// implementation accident.
//
// This implementation directly uses shifts instead of going
// through an intermediate MPI representation.
CBigNum& SetCompact(unsigned int nCompact)
{
unsigned int nSize = nCompact >> 24;
std::vector<unsigned char> vch(4 + nSize);
vch[3] = nSize;
if (nSize >= 1) vch[4] = (nCompact >> 16) & 0xff;
if (nSize >= 2) vch[5] = (nCompact >> 8) & 0xff;
if (nSize >= 3) vch[6] = (nCompact >> 0) & 0xff;
BN_mpi2bn(&vch[0], vch.size(), this);
bool fNegative =(nCompact & 0x00800000) != 0;
unsigned int nWord = nCompact & 0x007fffff;
if (nSize <= 3)
{
nWord >>= 8*(3-nSize);
BN_set_word(this, nWord);
}
else
{
BN_set_word(this, nWord);
BN_lshift(this, this, 8*(nSize-3));
}
BN_set_negative(this, fNegative);
return *this;
}
unsigned int GetCompact() const
{
unsigned int nSize = BN_bn2mpi(this, NULL);
std::vector<unsigned char> vch(nSize);
nSize -= 4;
BN_bn2mpi(this, &vch[0]);
unsigned int nCompact = nSize << 24;
if (nSize >= 1) nCompact |= (vch[4] << 16);
if (nSize >= 2) nCompact |= (vch[5] << 8);
if (nSize >= 3) nCompact |= (vch[6] << 0);
unsigned int nSize = BN_num_bytes(this);
unsigned int nCompact = 0;
if (nSize <= 3)
nCompact = BN_get_word(this) << 8*(3-nSize);
else
{
CBigNum bn;
BN_rshift(&bn, this, 8*(nSize-3));
nCompact = BN_get_word(&bn);
}
// The 0x00800000 bit denotes the sign.
// Thus, if it is already set, divide the mantissa by 256 and increase the exponent.
if (nCompact & 0x00800000)
{
nCompact >>= 8;
nSize++;
}
nCompact |= nSize << 24;
nCompact |= (BN_is_negative(this) ? 0x00800000 : 0);
return nCompact;
}
@@ -307,7 +347,7 @@ public:
psz++;
// hex string to bignum
static signed char phexdigit[256] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,1,2,3,4,5,6,7,8,9,0,0,0,0,0,0, 0,0xa,0xb,0xc,0xd,0xe,0xf,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0xa,0xb,0xc,0xd,0xe,0xf,0,0,0,0,0,0,0,0,0 };
static const signed char phexdigit[256] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,1,2,3,4,5,6,7,8,9,0,0,0,0,0,0, 0,0xa,0xb,0xc,0xd,0xe,0xf,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0xa,0xb,0xc,0xd,0xe,0xf,0,0,0,0,0,0,0,0,0 };
*this = 0;
while (isxdigit(*psz))
{
@@ -418,7 +458,7 @@ public:
CBigNum& operator>>=(unsigned int shift)
{
// Note: BN_rshift segfaults on 64-bit if 2^shift is greater than the number
// if built on ubuntu 9.04 or 9.10, probably depends on version of openssl
// if built on ubuntu 9.04 or 9.10, probably depends on version of OpenSSL
CBigNum a = 1;
a <<= shift;
if (BN_cmp(&a, this) > 0)

File diff suppressed because it is too large Load Diff

View File

@@ -1,7 +1,5 @@
// Copyright (c) 2010 Satoshi Nakamoto
// Copyright (c) 2009-2012 The Bitcoin developers
// Copyright (c) 2011-2012 Litecoin Developers
// Copyright (c) 2013 CasinoCoin Developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
@@ -12,13 +10,66 @@
#include <list>
#include <map>
class CBlockIndex;
class CReserveKey;
#include "json/json_spirit_reader_template.h"
#include "json/json_spirit_writer_template.h"
#include "json/json_spirit_utils.h"
#include "util.h"
// HTTP status codes
enum HTTPStatusCode
{
HTTP_OK = 200,
HTTP_BAD_REQUEST = 400,
HTTP_UNAUTHORIZED = 401,
HTTP_FORBIDDEN = 403,
HTTP_NOT_FOUND = 404,
HTTP_INTERNAL_SERVER_ERROR = 500,
};
// Bitcoin RPC error codes
enum RPCErrorCode
{
// Standard JSON-RPC 2.0 errors
RPC_INVALID_REQUEST = -32600,
RPC_METHOD_NOT_FOUND = -32601,
RPC_INVALID_PARAMS = -32602,
RPC_INTERNAL_ERROR = -32603,
RPC_PARSE_ERROR = -32700,
// General application defined errors
RPC_MISC_ERROR = -1, // std::exception thrown in command handling
RPC_FORBIDDEN_BY_SAFE_MODE = -2, // Server is in safe mode, and command is not allowed in safe mode
RPC_TYPE_ERROR = -3, // Unexpected type was passed as parameter
RPC_INVALID_ADDRESS_OR_KEY = -5, // Invalid address or key
RPC_OUT_OF_MEMORY = -7, // Ran out of memory during operation
RPC_INVALID_PARAMETER = -8, // Invalid, missing or duplicate parameter
RPC_DATABASE_ERROR = -20, // Database error
RPC_DESERIALIZATION_ERROR = -22, // Error parsing or validating structure in raw format
// P2P client errors
RPC_CLIENT_NOT_CONNECTED = -9, // Bitcoin is not connected
RPC_CLIENT_IN_INITIAL_DOWNLOAD = -10, // Still downloading initial blocks
// Wallet errors
RPC_WALLET_ERROR = -4, // Unspecified problem with wallet (key not found etc.)
RPC_WALLET_INSUFFICIENT_FUNDS = -6, // Not enough funds in wallet or account
RPC_WALLET_INVALID_ACCOUNT_NAME = -11, // Invalid account name
RPC_WALLET_KEYPOOL_RAN_OUT = -12, // Keypool ran out, call keypoolrefill first
RPC_WALLET_UNLOCK_NEEDED = -13, // Enter the wallet passphrase with walletpassphrase first
RPC_WALLET_PASSPHRASE_INCORRECT = -14, // The wallet passphrase entered was incorrect
RPC_WALLET_WRONG_ENC_STATE = -15, // Command given in wrong wallet encryption state (encrypting an encrypted wallet etc.)
RPC_WALLET_ENCRYPTION_FAILED = -16, // Failed to encrypt the wallet
RPC_WALLET_ALREADY_UNLOCKED = -17, // Wallet is already unlocked
};
json_spirit::Object JSONRPCError(int code, const std::string& message);
void ThreadRPCServer(void* parg);
void StartRPCThreads();
void StopRPCThreads();
int CommandLineRPC(int argc, char *argv[]);
/** Convert parameter values for RPC call from strings to command-specific JSON objects. */
@@ -30,13 +81,13 @@ json_spirit::Array RPCConvertValues(const std::string &strMethod, const std::vec
Use like: RPCTypeCheck(params, boost::assign::list_of(str_type)(int_type)(obj_type));
*/
void RPCTypeCheck(const json_spirit::Array& params,
const std::list<json_spirit::Value_type>& typesExpected);
const std::list<json_spirit::Value_type>& typesExpected, bool fAllowNull=false);
/*
Check for expected keys/value types in an Object.
Use like: RPCTypeCheck(object, boost::assign::map_list_of("name", str_type)("value", int_type));
*/
void RPCTypeCheck(const json_spirit::Object& o,
const std::map<std::string, json_spirit::Value_type>& typesExpected);
const std::map<std::string, json_spirit::Value_type>& typesExpected, bool fAllowNull=false);
typedef json_spirit::Value(*rpcfn_type)(const json_spirit::Array& params, bool fHelp);
@@ -46,6 +97,8 @@ public:
std::string name;
rpcfn_type actor;
bool okSafeMode;
bool threadSafe;
bool reqWallet;
};
/**
@@ -72,4 +125,85 @@ public:
extern const CRPCTable tableRPC;
extern void InitRPCMining();
extern void ShutdownRPCMining();
extern int64 nWalletUnlockTime;
extern int64 AmountFromValue(const json_spirit::Value& value);
extern json_spirit::Value ValueFromAmount(int64 amount);
extern double GetDifficulty(const CBlockIndex* blockindex = NULL);
extern std::string HexBits(unsigned int nBits);
extern std::string HelpRequiringPassphrase();
extern void EnsureWalletIsUnlocked();
extern json_spirit::Value getconnectioncount(const json_spirit::Array& params, bool fHelp); // in rpcnet.cpp
extern json_spirit::Value getpeerinfo(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value addnode(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value getaddednodeinfo(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value dumpprivkey(const json_spirit::Array& params, bool fHelp); // in rpcdump.cpp
extern json_spirit::Value importprivkey(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value getgenerate(const json_spirit::Array& params, bool fHelp); // in rpcmining.cpp
extern json_spirit::Value setgenerate(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value getnetworkhashps(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value gethashespersec(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value getmininginfo(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value getworkex(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value getwork(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value getblocktemplate(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value submitblock(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value getnewaddress(const json_spirit::Array& params, bool fHelp); // in rpcwallet.cpp
extern json_spirit::Value getaccountaddress(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value setaccount(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value getaccount(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value getaddressesbyaccount(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value sendtoaddress(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value signmessage(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value verifymessage(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value getreceivedbyaddress(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value getreceivedbyaccount(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value getbalance(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value movecmd(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value sendfrom(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value sendmany(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value addmultisigaddress(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value createmultisig(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value listreceivedbyaddress(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value listreceivedbyaccount(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value listtransactions(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value listaddressgroupings(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value listaccounts(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value listsinceblock(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value gettransaction(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value backupwallet(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value keypoolrefill(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value walletpassphrase(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value walletpassphrasechange(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value walletlock(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value encryptwallet(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value validateaddress(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value getinfo(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value getrawtransaction(const json_spirit::Array& params, bool fHelp); // in rcprawtransaction.cpp
extern json_spirit::Value listunspent(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value lockunspent(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value listlockunspent(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value createrawtransaction(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value decoderawtransaction(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value signrawtransaction(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value sendrawtransaction(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value getblockcount(const json_spirit::Array& params, bool fHelp); // in rpcblockchain.cpp
extern json_spirit::Value getbestblockhash(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value getdifficulty(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value settxfee(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value setmininput(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value getrawmempool(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value getblockhash(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value getblock(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value gettxoutsetinfo(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value gettxout(const json_spirit::Array& params, bool fHelp);
extern json_spirit::Value verifychain(const json_spirit::Array& params, bool fHelp);
#endif

182
src/bloom.cpp Normal file
View File

@@ -0,0 +1,182 @@
// Copyright (c) 2012 The Bitcoin developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#include <math.h>
#include <stdlib.h>
#include "bloom.h"
#include "main.h"
#include "script.h"
#define LN2SQUARED 0.4804530139182014246671025263266649717305529515945455
#define LN2 0.6931471805599453094172321214581765680755001343602552
using namespace std;
static const unsigned char bit_mask[8] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
CBloomFilter::CBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweakIn, unsigned char nFlagsIn) :
// The ideal size for a bloom filter with a given number of elements and false positive rate is:
// - nElements * log(fp rate) / ln(2)^2
// We ignore filter parameters which will create a bloom filter larger than the protocol limits
vData(min((unsigned int)(-1 / LN2SQUARED * nElements * log(nFPRate)), MAX_BLOOM_FILTER_SIZE * 8) / 8),
// The ideal number of hash functions is filter size * ln(2) / number of elements
// Again, we ignore filter parameters which will create a bloom filter with more hash functions than the protocol limits
// See http://en.wikipedia.org/wiki/Bloom_filter for an explanation of these formulas
isFull(false),
isEmpty(false),
nHashFuncs(min((unsigned int)(vData.size() * 8 / nElements * LN2), MAX_HASH_FUNCS)),
nTweak(nTweakIn),
nFlags(nFlagsIn)
{
}
inline unsigned int CBloomFilter::Hash(unsigned int nHashNum, const std::vector<unsigned char>& vDataToHash) const
{
// 0xFBA4C795 chosen as it guarantees a reasonable bit difference between nHashNum values.
return MurmurHash3(nHashNum * 0xFBA4C795 + nTweak, vDataToHash) % (vData.size() * 8);
}
void CBloomFilter::insert(const vector<unsigned char>& vKey)
{
if (isFull)
return;
for (unsigned int i = 0; i < nHashFuncs; i++)
{
unsigned int nIndex = Hash(i, vKey);
// Sets bit nIndex of vData
vData[nIndex >> 3] |= bit_mask[7 & nIndex];
}
isEmpty = false;
}
void CBloomFilter::insert(const COutPoint& outpoint)
{
CDataStream stream(SER_NETWORK, PROTOCOL_VERSION);
stream << outpoint;
vector<unsigned char> data(stream.begin(), stream.end());
insert(data);
}
void CBloomFilter::insert(const uint256& hash)
{
vector<unsigned char> data(hash.begin(), hash.end());
insert(data);
}
bool CBloomFilter::contains(const vector<unsigned char>& vKey) const
{
if (isFull)
return true;
if (isEmpty)
return false;
for (unsigned int i = 0; i < nHashFuncs; i++)
{
unsigned int nIndex = Hash(i, vKey);
// Checks bit nIndex of vData
if (!(vData[nIndex >> 3] & bit_mask[7 & nIndex]))
return false;
}
return true;
}
bool CBloomFilter::contains(const COutPoint& outpoint) const
{
CDataStream stream(SER_NETWORK, PROTOCOL_VERSION);
stream << outpoint;
vector<unsigned char> data(stream.begin(), stream.end());
return contains(data);
}
bool CBloomFilter::contains(const uint256& hash) const
{
vector<unsigned char> data(hash.begin(), hash.end());
return contains(data);
}
bool CBloomFilter::IsWithinSizeConstraints() const
{
return vData.size() <= MAX_BLOOM_FILTER_SIZE && nHashFuncs <= MAX_HASH_FUNCS;
}
bool CBloomFilter::IsRelevantAndUpdate(const CTransaction& tx, const uint256& hash)
{
bool fFound = false;
// Match if the filter contains the hash of tx
// for finding tx when they appear in a block
if (isFull)
return true;
if (isEmpty)
return false;
if (contains(hash))
fFound = true;
for (unsigned int i = 0; i < tx.vout.size(); i++)
{
const CTxOut& txout = tx.vout[i];
// Match if the filter contains any arbitrary script data element in any scriptPubKey in tx
// If this matches, also add the specific output that was matched.
// This means clients don't have to update the filter themselves when a new relevant tx
// is discovered in order to find spending transactions, which avoids round-tripping and race conditions.
CScript::const_iterator pc = txout.scriptPubKey.begin();
vector<unsigned char> data;
while (pc < txout.scriptPubKey.end())
{
opcodetype opcode;
if (!txout.scriptPubKey.GetOp(pc, opcode, data))
break;
if (data.size() != 0 && contains(data))
{
fFound = true;
if ((nFlags & BLOOM_UPDATE_MASK) == BLOOM_UPDATE_ALL)
insert(COutPoint(hash, i));
else if ((nFlags & BLOOM_UPDATE_MASK) == BLOOM_UPDATE_P2PUBKEY_ONLY)
{
txnouttype type;
vector<vector<unsigned char> > vSolutions;
if (Solver(txout.scriptPubKey, type, vSolutions) &&
(type == TX_PUBKEY || type == TX_MULTISIG))
insert(COutPoint(hash, i));
}
break;
}
}
}
if (fFound)
return true;
BOOST_FOREACH(const CTxIn& txin, tx.vin)
{
// Match if the filter contains an outpoint tx spends
if (contains(txin.prevout))
return true;
// Match if the filter contains any arbitrary script data element in any scriptSig in tx
CScript::const_iterator pc = txin.scriptSig.begin();
vector<unsigned char> data;
while (pc < txin.scriptSig.end())
{
opcodetype opcode;
if (!txin.scriptSig.GetOp(pc, opcode, data))
break;
if (data.size() != 0 && contains(data))
return true;
}
}
return false;
}
void CBloomFilter::UpdateEmptyFull()
{
bool full = true;
bool empty = true;
for (unsigned int i = 0; i < vData.size(); i++)
{
full &= vData[i] == 0xff;
empty &= vData[i] == 0;
}
isFull = full;
isEmpty = empty;
}

91
src/bloom.h Normal file
View File

@@ -0,0 +1,91 @@
// Copyright (c) 2012 The Bitcoin developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef BITCOIN_BLOOM_H
#define BITCOIN_BLOOM_H
#include <vector>
#include "uint256.h"
#include "serialize.h"
class COutPoint;
class CTransaction;
// 20,000 items with fp rate < 0.1% or 10,000 items and <0.0001%
static const unsigned int MAX_BLOOM_FILTER_SIZE = 36000; // bytes
static const unsigned int MAX_HASH_FUNCS = 50;
// First two bits of nFlags control how much IsRelevantAndUpdate actually updates
// The remaining bits are reserved
enum bloomflags
{
BLOOM_UPDATE_NONE = 0,
BLOOM_UPDATE_ALL = 1,
// Only adds outpoints to the filter if the output is a pay-to-pubkey/pay-to-multisig script
BLOOM_UPDATE_P2PUBKEY_ONLY = 2,
BLOOM_UPDATE_MASK = 3,
};
/**
* BloomFilter is a probabilistic filter which SPV clients provide
* so that we can filter the transactions we sends them.
*
* This allows for significantly more efficient transaction and block downloads.
*
* Because bloom filters are probabilistic, an SPV node can increase the false-
* positive rate, making us send them transactions which aren't actually theirs,
* allowing clients to trade more bandwidth for more privacy by obfuscating which
* keys are owned by them.
*/
class CBloomFilter
{
private:
std::vector<unsigned char> vData;
bool isFull;
bool isEmpty;
unsigned int nHashFuncs;
unsigned int nTweak;
unsigned char nFlags;
unsigned int Hash(unsigned int nHashNum, const std::vector<unsigned char>& vDataToHash) const;
public:
// Creates a new bloom filter which will provide the given fp rate when filled with the given number of elements
// Note that if the given parameters will result in a filter outside the bounds of the protocol limits,
// the filter created will be as close to the given parameters as possible within the protocol limits.
// This will apply if nFPRate is very low or nElements is unreasonably high.
// nTweak is a constant which is added to the seed value passed to the hash function
// It should generally always be a random value (and is largely only exposed for unit testing)
// nFlags should be one of the BLOOM_UPDATE_* enums (not _MASK)
CBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweak, unsigned char nFlagsIn);
CBloomFilter() : isFull(true) {}
IMPLEMENT_SERIALIZE
(
READWRITE(vData);
READWRITE(nHashFuncs);
READWRITE(nTweak);
READWRITE(nFlags);
)
void insert(const std::vector<unsigned char>& vKey);
void insert(const COutPoint& outpoint);
void insert(const uint256& hash);
bool contains(const std::vector<unsigned char>& vKey) const;
bool contains(const COutPoint& outpoint) const;
bool contains(const uint256& hash) const;
// True if the size is <= MAX_BLOOM_FILTER_SIZE and the number of hash functions is <= MAX_HASH_FUNCS
// (catch a filter which was just deserialized which was too big)
bool IsWithinSizeConstraints() const;
// Also adds any outputs which match the filter to the filter (to match their spending txes)
bool IsRelevantAndUpdate(const CTransaction& tx, const uint256& hash);
// Checks for empty and full filters to avoid wasting cpu
void UpdateEmptyFull();
};
#endif /* BITCOIN_BLOOM_H */

View File

@@ -1,6 +1,4 @@
// Copyright (c) 2009-2012 The Bitcoin developers
// Copyright (c) 2011-2012 Litecoin Developers
// Copyright (c) 2013 CasinoCoin Developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
@@ -16,45 +14,143 @@ namespace Checkpoints
{
typedef std::map<int, uint256> MapCheckpoints;
//
// How many times we expect transactions after the last checkpoint to
// be slower. This number is a compromise, as it can't be accurate for
// every system. When reindexing from a fast disk with a slow CPU, it
// can be up to 20, while when downloading from a slow network with a
// fast multicore CPU, it won't be much higher than 1.
static const double fSigcheckVerificationFactor = 5.0;
struct CCheckpointData {
const MapCheckpoints *mapCheckpoints;
int64 nTimeLastCheckpoint;
int64 nTransactionsLastCheckpoint;
double fTransactionsPerDay;
};
// What makes a good checkpoint block?
// + Is surrounded by blocks with reasonable timestamps
// (no blocks before with a timestamp after, none after with
// timestamp before)
// + Contains no strange transactions
//
static MapCheckpoints mapCheckpoints =
boost::assign::map_list_of
( 0, uint256("0x4f46c9af6d88a14114b7dc53a37d81ba4064cda5ae2ede1213ca28fea9b86e9c"))
( 1, uint256("0xd6eea1064e9292f2a38e138bec64797914cff3189fbc6eaa2cc646c1514ccdfe"))
( 777, uint256("0x838c9d331811479957009c39d2e1e723d47ebd21453b3ad92bcabd6cb5dd6ab1"))
( 7777, uint256("0x09ab8471911e0f19e19ef09e1ae212b389f6be3e62f0c140d58c27084da08907"))
( 17777, uint256("0x4dd8c993bdb32b5ff63a1108f86b1cf2b6f3fb3e5a831d3445d944b986addadf"))
( 27777, uint256("0xea7623cc7d0db290b4f2bf5b1d5cbf477bf0c019b667d21dbc5e82e12adcf3ae"))
( 37777, uint256("0xced9dadf09278f513337ebbeee11804b794faddf8f759e583a1e39b0839c238a"))
( 0, uint256("0x4f46c9af6d88a14114b7dc53a37d81ba4064cda5ae2ede1213ca28fea9b86e9c"))
( 1, uint256("0xd6eea1064e9292f2a38e138bec64797914cff3189fbc6eaa2cc646c1514ccdfe"))
( 777, uint256("0x838c9d331811479957009c39d2e1e723d47ebd21453b3ad92bcabd6cb5dd6ab1"))
( 7777, uint256("0x09ab8471911e0f19e19ef09e1ae212b389f6be3e62f0c140d58c27084da08907"))
( 17777, uint256("0x4dd8c993bdb32b5ff63a1108f86b1cf2b6f3fb3e5a831d3445d944b986addadf"))
( 27777, uint256("0xea7623cc7d0db290b4f2bf5b1d5cbf477bf0c019b667d21dbc5e82e12adcf3ae"))
( 37777, uint256("0xced9dadf09278f513337ebbeee11804b794faddf8f759e583a1e39b0839c238a"))
( 47777, uint256("0xb59586840946941d9fd92dc378e0f2bd5fa4da0ebd0636dc009acc2a0174e048"))
( 57777, uint256("0xfdf6fa6f606f3b14e7b2084b0e60a80b769c2ae04fbea62d1af78674d3d1ef76"))
( 67777, uint256("0x5c6732d5ec60454ce68579cf88118923ebdd5dc82b372310bcf7d7303ffddf39"))
( 77777, uint256("0x4fa19ce5f0a050cbf13b3ad194c39351276a7e587ccdef03fa926429b00a0d44"))
( 87777, uint256("0x278d5a4d6ee24dfa263e33032ef0496e0e6d43d9f1918dc269dc83c56d29199c"))
( 97777, uint256("0xf1ab4d14a7a9797d4c22336aea6d83db6a2e5c7f07121d5b67b958e37ea5e384"))
( 107777, uint256("0x421ac0aed1cfb4226d4936fa0ac93c45cce9c6bec7aca4e329a672d8414b23ec"))
( 117777, uint256("0x731713bd84c9fd90cebc6c85c56cd071bd0826841fa1ff193250c9dd897650eb"))
( 127777, uint256("0xd90c1bb94b4aea5dcb4cb4a03de76190e7b1cf2e14fd45dc8a1e317066118491"))
( 137777, uint256("0x97972ecfc9efd19567f49c96f3bfaad15b3cc63ce0aac1780c1276624e41e9a2"))
( 147777, uint256("0xe83328402d705d53074cce0deb98f09b9016357063b522d30dbc450d241a073d"))
( 157777, uint256("0x86b45c6144bcdff90b2846e1669fdd49392e2c9eef6132184afc1a9434ef43a0"))
( 167777, uint256("0x065a4f24bf79b45353e456e74a75194be95d49bb9fbe2ef0771b0ff4a2c86ac6"))
( 177777, uint256("0x221fff51bd7a48c300b863ff5edb82dacf893ea908c00c4f6915789bed0d27b2"))
( 187777, uint256("0xad3c4d3b73493cb9c888b30f550db4fda6a5ff8c6daa20404285e354c3a4c7a1"))
( 197777, uint256("0x2f9203c38cede6e1c61a69eb43c4f62d8e04ba62a57a7d866f4d715eec033cd4"))
( 207777, uint256("0x3631874522954f18e6834ad68918dcc060b2712fea7a381e11e10193c6ab2f24"))
( 217777, uint256("0xc86150e3408be1ca10eb1b94f374ae331c07dec6e2162bf06f5c31d83faa9715"))
;
static const CCheckpointData data = {
&mapCheckpoints,
1391536800, // * UNIX timestamp of last checkpoint block
356643, // * total number of transactions between genesis and last checkpoint
// (the tx=... number in the SetBestChain debug.log lines)
4500.0 // * estimated number of transactions per day after checkpoint
};
static MapCheckpoints mapCheckpointsTestnet =
boost::assign::map_list_of
( 0, uint256("0x"))
;
static const CCheckpointData dataTestnet = {
&mapCheckpointsTestnet,
1369685559,
37581,
300
};
const CCheckpointData &Checkpoints() {
if (fTestNet)
return dataTestnet;
else
return data;
}
bool CheckBlock(int nHeight, const uint256& hash)
{
if (fTestNet) return true; // Testnet has no checkpoints
if (!GetBoolArg("-checkpoints", true))
return true;
MapCheckpoints::const_iterator i = mapCheckpoints.find(nHeight);
if (i == mapCheckpoints.end()) return true;
const MapCheckpoints& checkpoints = *Checkpoints().mapCheckpoints;
MapCheckpoints::const_iterator i = checkpoints.find(nHeight);
if (i == checkpoints.end()) return true;
return hash == i->second;
}
// Guess how far we are in the verification process at the given block index
double GuessVerificationProgress(CBlockIndex *pindex) {
if (pindex==NULL)
return 0.0;
int64 nNow = time(NULL);
double fWorkBefore = 0.0; // Amount of work done before pindex
double fWorkAfter = 0.0; // Amount of work left after pindex (estimated)
// Work is defined as: 1.0 per transaction before the last checkoint, and
// fSigcheckVerificationFactor per transaction after.
const CCheckpointData &data = Checkpoints();
if (pindex->nChainTx <= data.nTransactionsLastCheckpoint) {
double nCheapBefore = pindex->nChainTx;
double nCheapAfter = data.nTransactionsLastCheckpoint - pindex->nChainTx;
double nExpensiveAfter = (nNow - data.nTimeLastCheckpoint)/86400.0*data.fTransactionsPerDay;
fWorkBefore = nCheapBefore;
fWorkAfter = nCheapAfter + nExpensiveAfter*fSigcheckVerificationFactor;
} else {
double nCheapBefore = data.nTransactionsLastCheckpoint;
double nExpensiveBefore = pindex->nChainTx - data.nTransactionsLastCheckpoint;
double nExpensiveAfter = (nNow - pindex->nTime)/86400.0*data.fTransactionsPerDay;
fWorkBefore = nCheapBefore + nExpensiveBefore*fSigcheckVerificationFactor;
fWorkAfter = nExpensiveAfter*fSigcheckVerificationFactor;
}
return fWorkBefore / (fWorkBefore + fWorkAfter);
}
int GetTotalBlocksEstimate()
{
if (fTestNet) return 0;
if (fTestNet) return 0; // Testnet has no checkpoints
if (!GetBoolArg("-checkpoints", true))
return 0;
return mapCheckpoints.rbegin()->first;
const MapCheckpoints& checkpoints = *Checkpoints().mapCheckpoints;
return checkpoints.rbegin()->first;
}
CBlockIndex* GetLastCheckpoint(const std::map<uint256, CBlockIndex*>& mapBlockIndex)
{
if (fTestNet) return NULL;
if (fTestNet) return NULL; // Testnet has no checkpoints
if (!GetBoolArg("-checkpoints", true))
return NULL;
BOOST_REVERSE_FOREACH(const MapCheckpoints::value_type& i, mapCheckpoints)
const MapCheckpoints& checkpoints = *Checkpoints().mapCheckpoints;
BOOST_REVERSE_FOREACH(const MapCheckpoints::value_type& i, checkpoints)
{
const uint256& hash = i.second;
std::map<uint256, CBlockIndex*>::const_iterator t = mapBlockIndex.find(hash);

View File

@@ -1,10 +1,8 @@
// Copyright (c) 2009-2012 The Bitcoin developers
// Copyright (c) 2011-2012 Litecoin Developers
// Copyright (c) 2013 CasinoCoin Developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef BITCOIN_CHECKPOINT_H
#define BITCOIN_CHECKPOINT_H
#define BITCOIN_CHECKPOINT_H
#include <map>
@@ -24,6 +22,8 @@ namespace Checkpoints
// Returns last CBlockIndex* in mapBlockIndex that is a checkpoint
CBlockIndex* GetLastCheckpoint(const std::map<uint256, CBlockIndex*>& mapBlockIndex);
double GuessVerificationProgress(CBlockIndex *pindex);
}
#endif

192
src/checkqueue.h Normal file
View File

@@ -0,0 +1,192 @@
// Copyright (c) 2012 The Bitcoin developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef CHECKQUEUE_H
#define CHECKQUEUE_H
#include <boost/thread/mutex.hpp>
#include <boost/thread/locks.hpp>
#include <boost/thread/condition_variable.hpp>
#include <vector>
#include <algorithm>
template<typename T> class CCheckQueueControl;
/** Queue for verifications that have to be performed.
* The verifications are represented by a type T, which must provide an
* operator(), returning a bool.
*
* One thread (the master) is assumed to push batches of verifications
* onto the queue, where they are processed by N-1 worker threads. When
* the master is done adding work, it temporarily joins the worker pool
* as an N'th worker, until all jobs are done.
*/
template<typename T> class CCheckQueue {
private:
// Mutex to protect the inner state
boost::mutex mutex;
// Worker threads block on this when out of work
boost::condition_variable condWorker;
// Master thread blocks on this when out of work
boost::condition_variable condMaster;
// The queue of elements to be processed.
// As the order of booleans doesn't matter, it is used as a LIFO (stack)
std::vector<T> queue;
// The number of workers (including the master) that are idle.
int nIdle;
// The total number of workers (including the master).
int nTotal;
// The temporary evaluation result.
bool fAllOk;
// Number of verifications that haven't completed yet.
// This includes elements that are not anymore in queue, but still in
// worker's own batches.
unsigned int nTodo;
// Whether we're shutting down.
bool fQuit;
// The maximum number of elements to be processed in one batch
unsigned int nBatchSize;
// Internal function that does bulk of the verification work.
bool Loop(bool fMaster = false) {
boost::condition_variable &cond = fMaster ? condMaster : condWorker;
std::vector<T> vChecks;
vChecks.reserve(nBatchSize);
unsigned int nNow = 0;
bool fOk = true;
do {
{
boost::unique_lock<boost::mutex> lock(mutex);
// first do the clean-up of the previous loop run (allowing us to do it in the same critsect)
if (nNow) {
fAllOk &= fOk;
nTodo -= nNow;
if (nTodo == 0 && !fMaster)
// We processed the last element; inform the master he can exit and return the result
condMaster.notify_one();
} else {
// first iteration
nTotal++;
}
// logically, the do loop starts here
while (queue.empty()) {
if ((fMaster || fQuit) && nTodo == 0) {
nTotal--;
bool fRet = fAllOk;
// reset the status for new work later
if (fMaster)
fAllOk = true;
// return the current status
return fRet;
}
nIdle++;
cond.wait(lock); // wait
nIdle--;
}
// Decide how many work units to process now.
// * Do not try to do everything at once, but aim for increasingly smaller batches so
// all workers finish approximately simultaneously.
// * Try to account for idle jobs which will instantly start helping.
// * Don't do batches smaller than 1 (duh), or larger than nBatchSize.
nNow = std::max(1U, std::min(nBatchSize, (unsigned int)queue.size() / (nTotal + nIdle + 1)));
vChecks.resize(nNow);
for (unsigned int i = 0; i < nNow; i++) {
// We want the lock on the mutex to be as short as possible, so swap jobs from the global
// queue to the local batch vector instead of copying.
vChecks[i].swap(queue.back());
queue.pop_back();
}
// Check whether we need to do work at all
fOk = fAllOk;
}
// execute work
BOOST_FOREACH(T &check, vChecks)
if (fOk)
fOk = check();
vChecks.clear();
} while(true);
}
public:
// Create a new check queue
CCheckQueue(unsigned int nBatchSizeIn) :
nIdle(0), nTotal(0), fAllOk(true), nTodo(0), fQuit(false), nBatchSize(nBatchSizeIn) {}
// Worker thread
void Thread() {
Loop();
}
// Wait until execution finishes, and return whether all evaluations where succesful.
bool Wait() {
return Loop(true);
}
// Add a batch of checks to the queue
void Add(std::vector<T> &vChecks) {
boost::unique_lock<boost::mutex> lock(mutex);
BOOST_FOREACH(T &check, vChecks) {
queue.push_back(T());
check.swap(queue.back());
}
nTodo += vChecks.size();
if (vChecks.size() == 1)
condWorker.notify_one();
else if (vChecks.size() > 1)
condWorker.notify_all();
}
~CCheckQueue() {
}
friend class CCheckQueueControl<T>;
};
/** RAII-style controller object for a CCheckQueue that guarantees the passed
* queue is finished before continuing.
*/
template<typename T> class CCheckQueueControl {
private:
CCheckQueue<T> *pqueue;
bool fDone;
public:
CCheckQueueControl(CCheckQueue<T> *pqueueIn) : pqueue(pqueueIn), fDone(false) {
// passed queue is supposed to be unused, or NULL
if (pqueue != NULL) {
assert(pqueue->nTotal == pqueue->nIdle);
assert(pqueue->nTodo == 0);
assert(pqueue->fAllOk == true);
}
}
bool Wait() {
if (pqueue == NULL)
return true;
bool fRet = pqueue->Wait();
fDone = true;
return fRet;
}
void Add(std::vector<T> &vChecks) {
if (pqueue != NULL)
pqueue->Add(vChecks);
}
~CCheckQueueControl() {
if (!fDone)
Wait();
}
};
#endif

26
src/clientversion.h Normal file
View File

@@ -0,0 +1,26 @@
#ifndef CLIENTVERSION_H
#define CLIENTVERSION_H
//
// client versioning and copyright year
//
// These need to be macros, as version.cpp's and bitcoin-qt.rc's voodoo requires it
#define CLIENT_VERSION_MAJOR 1
#define CLIENT_VERSION_MINOR 1
#define CLIENT_VERSION_REVISION 0
#define CLIENT_VERSION_BUILD 0
// Set to true for release, false for prerelease or test build
#define CLIENT_VERSION_IS_RELEASE true
// Copyright year (2009-this)
// Todo: update this when changing our copyright comments in the source
#define COPYRIGHT_YEAR 2014
// Converts the parameter X to a string after macro replacement on X has been performed.
// Don't merge these into one macro!
#define STRINGIZE(X) DO_STRINGIZE(X)
#define DO_STRINGIZE(X) #X
#endif // CLIENTVERSION_H

57
src/coincontrol.h Normal file
View File

@@ -0,0 +1,57 @@
#ifndef COINCONTROL_H
#define COINCONTROL_H
/** Coin Control Features. */
class CCoinControl
{
public:
CTxDestination destChange;
CCoinControl()
{
SetNull();
}
void SetNull()
{
destChange = CNoDestination();
setSelected.clear();
}
bool HasSelected() const
{
return (setSelected.size() > 0);
}
bool IsSelected(const uint256& hash, unsigned int n) const
{
COutPoint outpt(hash, n);
return (setSelected.count(outpt) > 0);
}
void Select(COutPoint& output)
{
setSelected.insert(output);
}
void UnSelect(COutPoint& output)
{
setSelected.erase(output);
}
void UnSelectAll()
{
setSelected.clear();
}
void ListSelected(std::vector<COutPoint>& vOutpoints)
{
vOutpoints.assign(setSelected.begin(), setSelected.end());
}
private:
std::set<COutPoint> setSelected;
};
#endif // COINCONTROL_H

View File

@@ -1,7 +1,5 @@
// Copyright (c) 2009-2010 Satoshi Nakamoto
// Copyright (c) 2009-2012 The Bitcoin developers
// Copyright (c) 2011-2012 Litecoin Developers
// Copyright (c) 2013 CasinoCoin Developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef _BITCOIN_COMPAT_H
@@ -13,6 +11,7 @@
#ifndef NOMINMAX
#define NOMINMAX
#endif
#define FD_SETSIZE 1024 // max number of fds in fd_set
#include <winsock2.h>
#include <mswsock.h>
#include <ws2tcpip.h>

View File

@@ -1,6 +1,4 @@
// Copyright (c) 2009-2012 The Bitcoin Developers
// Copyright (c) 2011-2012 Litecoin Developers
// Copyright (c) 2013 CasinoCoin Developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
@@ -19,12 +17,6 @@ bool CCrypter::SetKeyFromPassphrase(const SecureString& strKeyData, const std::v
if (nRounds < 1 || chSalt.size() != WALLET_CRYPTO_SALT_SIZE)
return false;
// Try to keep the keydata out of swap (and be a bit over-careful to keep the IV that we don't even use out of swap)
// Note that this does nothing about suspend-to-disk (which will put all our key data on disk)
// Note as well that at no point in this program is any attempt made to prevent stealing of keys by reading the memory of the running process.
mlock(&chKey[0], sizeof chKey);
mlock(&chIV[0], sizeof chIV);
int i = 0;
if (nDerivationMethod == 0)
i = EVP_BytesToKey(EVP_aes_256_cbc(), EVP_sha512(), &chSalt[0],
@@ -32,8 +24,8 @@ bool CCrypter::SetKeyFromPassphrase(const SecureString& strKeyData, const std::v
if (i != (int)WALLET_CRYPTO_KEY_SIZE)
{
memset(&chKey, 0, sizeof chKey);
memset(&chIV, 0, sizeof chIV);
OPENSSL_cleanse(chKey, sizeof(chKey));
OPENSSL_cleanse(chIV, sizeof(chIV));
return false;
}
@@ -46,12 +38,6 @@ bool CCrypter::SetKey(const CKeyingMaterial& chNewKey, const std::vector<unsigne
if (chNewKey.size() != WALLET_CRYPTO_KEY_SIZE || chNewIV.size() != WALLET_CRYPTO_KEY_SIZE)
return false;
// Try to keep the keydata out of swap
// Note that this does nothing about suspend-to-disk (which will put all our key data on disk)
// Note as well that at no point in this program is any attempt made to prevent stealing of keys by reading the memory of the running process.
mlock(&chKey[0], sizeof chKey);
mlock(&chIV[0], sizeof chIV);
memcpy(&chKey[0], &chNewKey[0], sizeof chKey);
memcpy(&chIV[0], &chNewIV[0], sizeof chIV);
@@ -114,17 +100,17 @@ bool CCrypter::Decrypt(const std::vector<unsigned char>& vchCiphertext, CKeyingM
}
bool EncryptSecret(CKeyingMaterial& vMasterKey, const CSecret &vchPlaintext, const uint256& nIV, std::vector<unsigned char> &vchCiphertext)
bool EncryptSecret(const CKeyingMaterial& vMasterKey, const CKeyingMaterial &vchPlaintext, const uint256& nIV, std::vector<unsigned char> &vchCiphertext)
{
CCrypter cKeyCrypter;
std::vector<unsigned char> chIV(WALLET_CRYPTO_KEY_SIZE);
memcpy(&chIV[0], &nIV, WALLET_CRYPTO_KEY_SIZE);
if(!cKeyCrypter.SetKey(vMasterKey, chIV))
return false;
return cKeyCrypter.Encrypt((CKeyingMaterial)vchPlaintext, vchCiphertext);
return cKeyCrypter.Encrypt(*((const CKeyingMaterial*)&vchPlaintext), vchCiphertext);
}
bool DecryptSecret(const CKeyingMaterial& vMasterKey, const std::vector<unsigned char>& vchCiphertext, const uint256& nIV, CSecret& vchPlaintext)
bool DecryptSecret(const CKeyingMaterial& vMasterKey, const std::vector<unsigned char>& vchCiphertext, const uint256& nIV, CKeyingMaterial& vchPlaintext)
{
CCrypter cKeyCrypter;
std::vector<unsigned char> chIV(WALLET_CRYPTO_KEY_SIZE);

View File

@@ -1,6 +1,4 @@
// Copyright (c) 2009-2012 The Bitcoin Developers
// Copyright (c) 2011-2012 Litecoin Developers
// Copyright (c) 2013 CasinoCoin Developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef __CRYPTER_H__
@@ -78,25 +76,32 @@ public:
void CleanKey()
{
memset(&chKey, 0, sizeof chKey);
memset(&chIV, 0, sizeof chIV);
munlock(&chKey, sizeof chKey);
munlock(&chIV, sizeof chIV);
OPENSSL_cleanse(chKey, sizeof(chKey));
OPENSSL_cleanse(chIV, sizeof(chIV));
fKeySet = false;
}
CCrypter()
{
fKeySet = false;
// Try to keep the key data out of swap (and be a bit over-careful to keep the IV that we don't even use out of swap)
// Note that this does nothing about suspend-to-disk (which will put all our key data on disk)
// Note as well that at no point in this program is any attempt made to prevent stealing of keys by reading the memory of the running process.
LockedPageManager::instance.LockRange(&chKey[0], sizeof chKey);
LockedPageManager::instance.LockRange(&chIV[0], sizeof chIV);
}
~CCrypter()
{
CleanKey();
LockedPageManager::instance.UnlockRange(&chKey[0], sizeof chKey);
LockedPageManager::instance.UnlockRange(&chIV[0], sizeof chIV);
}
};
bool EncryptSecret(CKeyingMaterial& vMasterKey, const CSecret &vchPlaintext, const uint256& nIV, std::vector<unsigned char> &vchCiphertext);
bool DecryptSecret(const CKeyingMaterial& vMasterKey, const std::vector<unsigned char> &vchCiphertext, const uint256& nIV, CSecret &vchPlaintext);
bool EncryptSecret(const CKeyingMaterial& vMasterKey, const CKeyingMaterial &vchPlaintext, const uint256& nIV, std::vector<unsigned char> &vchCiphertext);
bool DecryptSecret(const CKeyingMaterial& vMasterKey, const std::vector<unsigned char>& vchCiphertext, const uint256& nIV, CKeyingMaterial& vchPlaintext);
#endif

View File

@@ -1,15 +1,11 @@
// Copyright (c) 2009-2010 Satoshi Nakamoto
// Copyright (c) 2009-2012 The Bitcoin developers
// Copyright (c) 2011-2012 Litecoin Developers
// Copyright (c) 2013 CasinoCoin Developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#include "db.h"
#include "util.h"
#include "main.h"
#include "init.h"
#include <boost/version.hpp>
#include <boost/filesystem.hpp>
#include <boost/filesystem/fstream.hpp>
@@ -37,19 +33,17 @@ void CDBEnv::EnvShutdown()
return;
fDbEnvInit = false;
try
{
dbenv.close(0);
}
catch (const DbException& e)
{
printf("EnvShutdown exception: %s (%d)\n", e.what(), e.get_errno());
}
DbEnv(0).remove(GetDataDir().string().c_str(), 0);
int ret = dbenv.close(0);
if (ret != 0)
printf("EnvShutdown exception: %s (%d)\n", DbEnv::strerror(ret), ret);
if (!fMockDb)
DbEnv(0).remove(path.string().c_str(), 0);
}
CDBEnv::CDBEnv() : dbenv(0)
CDBEnv::CDBEnv() : dbenv(DB_CXX_NO_EXCEPTIONS)
{
fDbEnvInit = false;
fMockDb = false;
}
CDBEnv::~CDBEnv()
@@ -62,37 +56,34 @@ void CDBEnv::Close()
EnvShutdown();
}
bool CDBEnv::Open(boost::filesystem::path pathEnv_)
bool CDBEnv::Open(const boost::filesystem::path& pathIn)
{
if (fDbEnvInit)
return true;
if (fShutdown)
return false;
boost::this_thread::interruption_point();
pathEnv = pathEnv_;
filesystem::path pathDataDir = pathEnv;
filesystem::path pathLogDir = pathDataDir / "database";
path = pathIn;
filesystem::path pathLogDir = path / "database";
filesystem::create_directory(pathLogDir);
filesystem::path pathErrorFile = pathDataDir / "db.log";
filesystem::path pathErrorFile = path / "db.log";
printf("dbenv.open LogDir=%s ErrorFile=%s\n", pathLogDir.string().c_str(), pathErrorFile.string().c_str());
unsigned int nEnvFlags = 0;
if (GetBoolArg("-privdb", true))
nEnvFlags |= DB_PRIVATE;
int nDbCache = GetArg("-dbcache", 25);
dbenv.set_lg_dir(pathLogDir.string().c_str());
dbenv.set_cachesize(nDbCache / 1024, (nDbCache % 1024)*1048576, 1);
dbenv.set_lg_bsize(1048576);
dbenv.set_lg_max(10485760);
dbenv.set_lk_max_locks(537000);
dbenv.set_lk_max_objects(10000);
dbenv.set_cachesize(0, 0x100000, 1); // 1 MiB should be enough for just the wallet
dbenv.set_lg_bsize(0x10000);
dbenv.set_lg_max(1048576);
dbenv.set_lk_max_locks(40000);
dbenv.set_lk_max_objects(40000);
dbenv.set_errfile(fopen(pathErrorFile.string().c_str(), "a")); /// debug
dbenv.set_flags(DB_AUTO_COMMIT, 1);
dbenv.set_flags(DB_TXN_WRITE_NOSYNC, 1);
dbenv.log_set_config(DB_LOG_AUTO_REMOVE, 1);
int ret = dbenv.open(pathDataDir.string().c_str(),
int ret = dbenv.open(path.string().c_str(),
DB_CREATE |
DB_INIT_LOCK |
DB_INIT_LOG |
@@ -102,38 +93,123 @@ bool CDBEnv::Open(boost::filesystem::path pathEnv_)
DB_RECOVER |
nEnvFlags,
S_IRUSR | S_IWUSR);
if (ret > 0)
return error("CDB() : error %d opening database environment", ret);
// Check that the number of locks is sufficient
u_int32_t nMaxLocks;
if (!dbenv.get_lk_max_locks(&nMaxLocks))
{
int nBlocks, nDeepReorg;
std::string strMessage;
nBlocks = nMaxLocks / 48768;
nDeepReorg = (nBlocks - 1) / 2;
printf("Final lk_max_locks is %lu, sufficient for (worst case) %d block%s in a single transaction (up to a %d-deep reorganization)\n", (unsigned long)nMaxLocks, nBlocks, (nBlocks == 1) ? "" : "s", nDeepReorg);
if (nDeepReorg < 3)
{
if (nBlocks < 1)
strMessage = strprintf(_("Warning: DB_CONFIG has set_lk_max_locks %lu, which may be too low for a single block. If this limit is reached, Bitcoin may stop working."), (unsigned long)nMaxLocks);
else
strMessage = strprintf(_("Warning: DB_CONFIG has set_lk_max_locks %lu, which may be too low for a common blockchain reorganization. If this limit is reached, Bitcoin may stop working."), (unsigned long)nMaxLocks);
strMiscWarning = strMessage;
printf("*** %s\n", strMessage.c_str());
}
}
if (ret != 0)
return error("CDB() : error %s (%d) opening database environment", DbEnv::strerror(ret), ret);
fDbEnvInit = true;
fMockDb = false;
return true;
}
void CDBEnv::MakeMock()
{
if (fDbEnvInit)
throw runtime_error("CDBEnv::MakeMock(): already initialized");
boost::this_thread::interruption_point();
printf("CDBEnv::MakeMock()\n");
dbenv.set_cachesize(1, 0, 1);
dbenv.set_lg_bsize(10485760*4);
dbenv.set_lg_max(10485760);
dbenv.set_lk_max_locks(10000);
dbenv.set_lk_max_objects(10000);
dbenv.set_flags(DB_AUTO_COMMIT, 1);
dbenv.log_set_config(DB_LOG_IN_MEMORY, 1);
int ret = dbenv.open(NULL,
DB_CREATE |
DB_INIT_LOCK |
DB_INIT_LOG |
DB_INIT_MPOOL |
DB_INIT_TXN |
DB_THREAD |
DB_PRIVATE,
S_IRUSR | S_IWUSR);
if (ret > 0)
throw runtime_error(strprintf("CDBEnv::MakeMock(): error %d opening database environment", ret));
fDbEnvInit = true;
fMockDb = true;
}
CDBEnv::VerifyResult CDBEnv::Verify(std::string strFile, bool (*recoverFunc)(CDBEnv& dbenv, std::string strFile))
{
LOCK(cs_db);
assert(mapFileUseCount.count(strFile) == 0);
Db db(&dbenv, 0);
int result = db.verify(strFile.c_str(), NULL, NULL, 0);
if (result == 0)
return VERIFY_OK;
else if (recoverFunc == NULL)
return RECOVER_FAIL;
// Try to recover:
bool fRecovered = (*recoverFunc)(*this, strFile);
return (fRecovered ? RECOVER_OK : RECOVER_FAIL);
}
bool CDBEnv::Salvage(std::string strFile, bool fAggressive,
std::vector<CDBEnv::KeyValPair >& vResult)
{
LOCK(cs_db);
assert(mapFileUseCount.count(strFile) == 0);
u_int32_t flags = DB_SALVAGE;
if (fAggressive) flags |= DB_AGGRESSIVE;
stringstream strDump;
Db db(&dbenv, 0);
int result = db.verify(strFile.c_str(), NULL, &strDump, flags);
if (result == DB_VERIFY_BAD)
{
printf("Error: Salvage found errors, all data may not be recoverable.\n");
if (!fAggressive)
{
printf("Error: Rerun with aggressive mode to ignore errors and continue.\n");
return false;
}
}
if (result != 0 && result != DB_VERIFY_BAD)
{
printf("ERROR: db salvage failed: %d\n",result);
return false;
}
// Format of bdb dump is ascii lines:
// header lines...
// HEADER=END
// hexadecimal key
// hexadecimal value
// ... repeated
// DATA=END
string strLine;
while (!strDump.eof() && strLine != "HEADER=END")
getline(strDump, strLine); // Skip past header
std::string keyHex, valueHex;
while (!strDump.eof() && keyHex != "DATA=END")
{
getline(strDump, keyHex);
if (keyHex != "DATA_END")
{
getline(strDump, valueHex);
vResult.push_back(make_pair(ParseHex(keyHex),ParseHex(valueHex)));
}
}
return (result == 0);
}
void CDBEnv::CheckpointLSN(std::string strFile)
{
dbenv.txn_checkpoint(0, 0, 0);
if (fMockDb)
return;
dbenv.lsn_reset(strFile.c_str(), 0);
}
@@ -163,21 +239,27 @@ CDB::CDB(const char *pszFile, const char* pszMode) :
{
pdb = new Db(&bitdb.dbenv, 0);
bool fMockDb = bitdb.IsMock();
if (fMockDb)
{
DbMpoolFile*mpf = pdb->get_mpf();
ret = mpf->set_flags(DB_MPOOL_NOFILE, 1);
if (ret != 0)
throw runtime_error(strprintf("CDB() : failed to configure for no temp file backing for database %s", pszFile));
}
ret = pdb->open(NULL, // Txn pointer
pszFile, // Filename
"main", // Logical db name
fMockDb ? NULL : pszFile, // Filename
fMockDb ? pszFile : "main", // Logical db name
DB_BTREE, // Database type
nFlags, // Flags
0);
if (ret > 0)
if (ret != 0)
{
delete pdb;
pdb = NULL;
{
LOCK(bitdb.cs_db);
--bitdb.mapFileUseCount[strFile];
}
--bitdb.mapFileUseCount[strFile];
strFile = "";
throw runtime_error(strprintf("CDB() : can't open database file %s, error %d", pszFile, ret));
}
@@ -195,12 +277,17 @@ CDB::CDB(const char *pszFile, const char* pszMode) :
}
}
static bool IsChainFile(std::string strFile)
void CDB::Flush()
{
if (strFile == "blkindex.dat")
return true;
if (activeTxn)
return;
return false;
// Flush database activity from memory pool to disk log
unsigned int nMinutes = 0;
if (fReadOnly)
nMinutes = 1;
bitdb.dbenv.txn_checkpoint(nMinutes ? GetArg("-dblogsize", 100)*1024 : 0, nMinutes, 0);
}
void CDB::Close()
@@ -212,16 +299,7 @@ void CDB::Close()
activeTxn = NULL;
pdb = NULL;
// Flush database activity from memory pool to disk log
unsigned int nMinutes = 0;
if (fReadOnly)
nMinutes = 1;
if (IsChainFile(strFile))
nMinutes = 2;
if (IsChainFile(strFile) && IsInitialBlockDownload())
nMinutes = 5;
bitdb.dbenv.txn_checkpoint(nMinutes ? GetArg("-dblogsize", 100)*1024 : 0, nMinutes, 0);
Flush();
{
LOCK(bitdb.cs_db);
@@ -244,9 +322,18 @@ void CDBEnv::CloseDb(const string& strFile)
}
}
bool CDBEnv::RemoveDb(const string& strFile)
{
this->CloseDb(strFile);
LOCK(cs_db);
int rc = dbenv.dbremove(NULL, strFile.c_str(), NULL, DB_AUTO_COMMIT);
return (rc == 0);
}
bool CDB::Rewrite(const string& strFile, const char* pszSkip)
{
while (!fShutdown)
while (true)
{
{
LOCK(bitdb.cs_db);
@@ -263,7 +350,7 @@ bool CDB::Rewrite(const string& strFile, const char* pszSkip)
{ // surround usage of db with extra {}
CDB db(strFile.c_str(), "r");
Db* pdbCopy = new Db(&bitdb.dbenv, 0);
int ret = pdbCopy->open(NULL, // Txn pointer
strFileRes.c_str(), // Filename
"main", // Logical db name
@@ -275,7 +362,7 @@ bool CDB::Rewrite(const string& strFile, const char* pszSkip)
printf("Cannot create database file %s\n", strFileRes.c_str());
fSuccess = false;
}
Dbc* pcursor = db.GetCursor();
if (pcursor)
while (fSuccess)
@@ -332,7 +419,7 @@ bool CDB::Rewrite(const string& strFile, const char* pszSkip)
return fSuccess;
}
}
Sleep(100);
MilliSleep(100);
}
return false;
}
@@ -360,10 +447,9 @@ void CDBEnv::Flush(bool fShutdown)
CloseDb(strFile);
printf("%s checkpoint\n", strFile.c_str());
dbenv.txn_checkpoint(0, 0, 0);
if (!IsChainFile(strFile) || fDetachDB) {
printf("%s detach\n", strFile.c_str());
printf("%s detach\n", strFile.c_str());
if (!fMockDb)
dbenv.lsn_reset(strFile.c_str(), 0);
}
printf("%s closed\n", strFile.c_str());
mapFileUseCount.erase(mi++);
}
@@ -378,6 +464,8 @@ void CDBEnv::Flush(bool fShutdown)
{
dbenv.log_archive(&listp, DB_ARCH_REMOVE);
Close();
if (!fMockDb)
boost::filesystem::remove_all(path / "database");
}
}
}
@@ -388,356 +476,6 @@ void CDBEnv::Flush(bool fShutdown)
//
// CTxDB
//
bool CTxDB::ReadTxIndex(uint256 hash, CTxIndex& txindex)
{
assert(!fClient);
txindex.SetNull();
return Read(make_pair(string("tx"), hash), txindex);
}
bool CTxDB::UpdateTxIndex(uint256 hash, const CTxIndex& txindex)
{
assert(!fClient);
return Write(make_pair(string("tx"), hash), txindex);
}
bool CTxDB::AddTxIndex(const CTransaction& tx, const CDiskTxPos& pos, int nHeight)
{
assert(!fClient);
// Add to tx index
uint256 hash = tx.GetHash();
CTxIndex txindex(pos, tx.vout.size());
return Write(make_pair(string("tx"), hash), txindex);
}
bool CTxDB::EraseTxIndex(const CTransaction& tx)
{
assert(!fClient);
uint256 hash = tx.GetHash();
return Erase(make_pair(string("tx"), hash));
}
bool CTxDB::ContainsTx(uint256 hash)
{
assert(!fClient);
return Exists(make_pair(string("tx"), hash));
}
bool CTxDB::ReadDiskTx(uint256 hash, CTransaction& tx, CTxIndex& txindex)
{
assert(!fClient);
tx.SetNull();
if (!ReadTxIndex(hash, txindex))
return false;
return (tx.ReadFromDisk(txindex.pos));
}
bool CTxDB::ReadDiskTx(uint256 hash, CTransaction& tx)
{
CTxIndex txindex;
return ReadDiskTx(hash, tx, txindex);
}
bool CTxDB::ReadDiskTx(COutPoint outpoint, CTransaction& tx, CTxIndex& txindex)
{
return ReadDiskTx(outpoint.hash, tx, txindex);
}
bool CTxDB::ReadDiskTx(COutPoint outpoint, CTransaction& tx)
{
CTxIndex txindex;
return ReadDiskTx(outpoint.hash, tx, txindex);
}
bool CTxDB::WriteBlockIndex(const CDiskBlockIndex& blockindex)
{
return Write(make_pair(string("blockindex"), blockindex.GetBlockHash()), blockindex);
}
bool CTxDB::ReadHashBestChain(uint256& hashBestChain)
{
return Read(string("hashBestChain"), hashBestChain);
}
bool CTxDB::WriteHashBestChain(uint256 hashBestChain)
{
return Write(string("hashBestChain"), hashBestChain);
}
bool CTxDB::ReadBestInvalidWork(CBigNum& bnBestInvalidWork)
{
return Read(string("bnBestInvalidWork"), bnBestInvalidWork);
}
bool CTxDB::WriteBestInvalidWork(CBigNum bnBestInvalidWork)
{
return Write(string("bnBestInvalidWork"), bnBestInvalidWork);
}
CBlockIndex static * InsertBlockIndex(uint256 hash)
{
if (hash == 0)
return NULL;
// Return existing
map<uint256, CBlockIndex*>::iterator mi = mapBlockIndex.find(hash);
if (mi != mapBlockIndex.end())
return (*mi).second;
// Create new
CBlockIndex* pindexNew = new CBlockIndex();
if (!pindexNew)
throw runtime_error("LoadBlockIndex() : new CBlockIndex failed");
mi = mapBlockIndex.insert(make_pair(hash, pindexNew)).first;
pindexNew->phashBlock = &((*mi).first);
return pindexNew;
}
bool CTxDB::LoadBlockIndex()
{
if (!LoadBlockIndexGuts())
return false;
if (fRequestShutdown)
return true;
// Calculate bnChainWork
vector<pair<int, CBlockIndex*> > vSortedByHeight;
vSortedByHeight.reserve(mapBlockIndex.size());
BOOST_FOREACH(const PAIRTYPE(uint256, CBlockIndex*)& item, mapBlockIndex)
{
CBlockIndex* pindex = item.second;
vSortedByHeight.push_back(make_pair(pindex->nHeight, pindex));
}
sort(vSortedByHeight.begin(), vSortedByHeight.end());
BOOST_FOREACH(const PAIRTYPE(int, CBlockIndex*)& item, vSortedByHeight)
{
CBlockIndex* pindex = item.second;
pindex->bnChainWork = (pindex->pprev ? pindex->pprev->bnChainWork : 0) + pindex->GetBlockWork();
}
// Load hashBestChain pointer to end of best chain
if (!ReadHashBestChain(hashBestChain))
{
if (pindexGenesisBlock == NULL)
return true;
return error("CTxDB::LoadBlockIndex() : hashBestChain not loaded");
}
if (!mapBlockIndex.count(hashBestChain))
return error("CTxDB::LoadBlockIndex() : hashBestChain not found in the block index");
pindexBest = mapBlockIndex[hashBestChain];
nBestHeight = pindexBest->nHeight;
bnBestChainWork = pindexBest->bnChainWork;
printf("LoadBlockIndex(): hashBestChain=%s height=%d date=%s\n",
hashBestChain.ToString().substr(0,20).c_str(), nBestHeight,
DateTimeStrFormat("%x %H:%M:%S", pindexBest->GetBlockTime()).c_str());
// Load bnBestInvalidWork, OK if it doesn't exist
ReadBestInvalidWork(bnBestInvalidWork);
// Verify blocks in the best chain
int nCheckLevel = GetArg("-checklevel", 1);
int nCheckDepth = GetArg( "-checkblocks", 2500);
if (nCheckDepth == 0)
nCheckDepth = 1000000000; // suffices until the year 19000
if (nCheckDepth > nBestHeight)
nCheckDepth = nBestHeight;
printf("Verifying last %i blocks at level %i\n", nCheckDepth, nCheckLevel);
CBlockIndex* pindexFork = NULL;
map<pair<unsigned int, unsigned int>, CBlockIndex*> mapBlockPos;
for (CBlockIndex* pindex = pindexBest; pindex && pindex->pprev; pindex = pindex->pprev)
{
if (fRequestShutdown || pindex->nHeight < nBestHeight-nCheckDepth)
break;
CBlock block;
if (!block.ReadFromDisk(pindex))
return error("LoadBlockIndex() : block.ReadFromDisk failed");
// check level 1: verify block validity
if (nCheckLevel>0 && !block.CheckBlock())
{
printf("LoadBlockIndex() : *** found bad block at %d, hash=%s\n", pindex->nHeight, pindex->GetBlockHash().ToString().c_str());
pindexFork = pindex->pprev;
}
// check level 2: verify transaction index validity
if (nCheckLevel>1)
{
pair<unsigned int, unsigned int> pos = make_pair(pindex->nFile, pindex->nBlockPos);
mapBlockPos[pos] = pindex;
BOOST_FOREACH(const CTransaction &tx, block.vtx)
{
uint256 hashTx = tx.GetHash();
CTxIndex txindex;
if (ReadTxIndex(hashTx, txindex))
{
// check level 3: checker transaction hashes
if (nCheckLevel>2 || pindex->nFile != txindex.pos.nFile || pindex->nBlockPos != txindex.pos.nBlockPos)
{
// either an error or a duplicate transaction
CTransaction txFound;
if (!txFound.ReadFromDisk(txindex.pos))
{
printf("LoadBlockIndex() : *** cannot read mislocated transaction %s\n", hashTx.ToString().c_str());
pindexFork = pindex->pprev;
}
else
if (txFound.GetHash() != hashTx) // not a duplicate tx
{
printf("LoadBlockIndex(): *** invalid tx position for %s\n", hashTx.ToString().c_str());
pindexFork = pindex->pprev;
}
}
// check level 4: check whether spent txouts were spent within the main chain
unsigned int nOutput = 0;
if (nCheckLevel>3)
{
BOOST_FOREACH(const CDiskTxPos &txpos, txindex.vSpent)
{
if (!txpos.IsNull())
{
pair<unsigned int, unsigned int> posFind = make_pair(txpos.nFile, txpos.nBlockPos);
if (!mapBlockPos.count(posFind))
{
printf("LoadBlockIndex(): *** found bad spend at %d, hashBlock=%s, hashTx=%s\n", pindex->nHeight, pindex->GetBlockHash().ToString().c_str(), hashTx.ToString().c_str());
pindexFork = pindex->pprev;
}
// check level 6: check whether spent txouts were spent by a valid transaction that consume them
if (nCheckLevel>5)
{
CTransaction txSpend;
if (!txSpend.ReadFromDisk(txpos))
{
printf("LoadBlockIndex(): *** cannot read spending transaction of %s:%i from disk\n", hashTx.ToString().c_str(), nOutput);
pindexFork = pindex->pprev;
}
else if (!txSpend.CheckTransaction())
{
printf("LoadBlockIndex(): *** spending transaction of %s:%i is invalid\n", hashTx.ToString().c_str(), nOutput);
pindexFork = pindex->pprev;
}
else
{
bool fFound = false;
BOOST_FOREACH(const CTxIn &txin, txSpend.vin)
if (txin.prevout.hash == hashTx && txin.prevout.n == nOutput)
fFound = true;
if (!fFound)
{
printf("LoadBlockIndex(): *** spending transaction of %s:%i does not spend it\n", hashTx.ToString().c_str(), nOutput);
pindexFork = pindex->pprev;
}
}
}
}
nOutput++;
}
}
}
// check level 5: check whether all prevouts are marked spent
if (nCheckLevel>4)
{
BOOST_FOREACH(const CTxIn &txin, tx.vin)
{
CTxIndex txindex;
if (ReadTxIndex(txin.prevout.hash, txindex))
if (txindex.vSpent.size()-1 < txin.prevout.n || txindex.vSpent[txin.prevout.n].IsNull())
{
printf("LoadBlockIndex(): *** found unspent prevout %s:%i in %s\n", txin.prevout.hash.ToString().c_str(), txin.prevout.n, hashTx.ToString().c_str());
pindexFork = pindex->pprev;
}
}
}
}
}
}
if (pindexFork && !fRequestShutdown)
{
// Reorg back to the fork
printf("LoadBlockIndex() : *** moving best chain pointer back to block %d\n", pindexFork->nHeight);
CBlock block;
if (!block.ReadFromDisk(pindexFork))
return error("LoadBlockIndex() : block.ReadFromDisk failed");
CTxDB txdb;
block.SetBestChain(txdb, pindexFork);
}
return true;
}
bool CTxDB::LoadBlockIndexGuts()
{
// Get database cursor
Dbc* pcursor = GetCursor();
if (!pcursor)
return false;
// Load mapBlockIndex
unsigned int fFlags = DB_SET_RANGE;
loop
{
// Read next record
CDataStream ssKey(SER_DISK, CLIENT_VERSION);
if (fFlags == DB_SET_RANGE)
ssKey << make_pair(string("blockindex"), uint256(0));
CDataStream ssValue(SER_DISK, CLIENT_VERSION);
int ret = ReadAtCursor(pcursor, ssKey, ssValue, fFlags);
fFlags = DB_NEXT;
if (ret == DB_NOTFOUND)
break;
else if (ret != 0)
return false;
// Unserialize
try {
string strType;
ssKey >> strType;
if (strType == "blockindex" && !fRequestShutdown)
{
CDiskBlockIndex diskindex;
ssValue >> diskindex;
// Construct block index object
CBlockIndex* pindexNew = InsertBlockIndex(diskindex.GetBlockHash());
pindexNew->pprev = InsertBlockIndex(diskindex.hashPrev);
pindexNew->pnext = InsertBlockIndex(diskindex.hashNext);
pindexNew->nFile = diskindex.nFile;
pindexNew->nBlockPos = diskindex.nBlockPos;
pindexNew->nHeight = diskindex.nHeight;
pindexNew->nVersion = diskindex.nVersion;
pindexNew->hashMerkleRoot = diskindex.hashMerkleRoot;
pindexNew->nTime = diskindex.nTime;
pindexNew->nBits = diskindex.nBits;
pindexNew->nNonce = diskindex.nNonce;
// Watch for genesis block
if (pindexGenesisBlock == NULL && diskindex.GetBlockHash() == hashGenesisBlock)
pindexGenesisBlock = pindexNew;
if (!pindexNew->CheckIndex())
return error("LoadBlockIndex() : CheckIndex failed at %d", pindexNew->nHeight);
}
else
{
break; // if shutdown requested or finished loading block index
}
} // try
catch (std::exception &e) {
return error("%s() : deserialize error", __PRETTY_FUNCTION__);
}
}
pcursor->close();
return true;
}
@@ -802,6 +540,8 @@ bool CAddrDB::Read(CAddrMan& addr)
// use file size to size memory buffer
int fileSize = GetFilesize(filein);
int dataSize = fileSize - sizeof(uint256);
//Don't try to resize to a negative number if file is small
if ( dataSize < 0 ) dataSize = 0;
vector<unsigned char> vchData;
vchData.resize(dataSize);
uint256 hashIn;
@@ -823,20 +563,22 @@ bool CAddrDB::Read(CAddrMan& addr)
if (hashIn != hashTmp)
return error("CAddrman::Read() : checksum mismatch; data corrupted");
// de-serialize address data
unsigned char pchMsgTmp[4];
try {
// de-serialize file header (pchMessageStart magic number) and
ssPeers >> FLATDATA(pchMsgTmp);
// verify the network matches ours
if (memcmp(pchMsgTmp, pchMessageStart, sizeof(pchMsgTmp)))
return error("CAddrman::Read() : invalid network magic number");
// de-serialize address data into one CAddrMan object
ssPeers >> addr;
}
catch (std::exception &e) {
return error("CAddrman::Read() : I/O error or stream data corrupted");
}
// finally, verify the network matches ours
if (memcmp(pchMsgTmp, pchMessageStart, sizeof(pchMsgTmp)))
return error("CAddrman::Read() : invalid network magic number");
return true;
}

View File

@@ -1,7 +1,5 @@
// Copyright (c) 2009-2010 Satoshi Nakamoto
// Copyright (c) 2009-2012 The Bitcoin developers
// Copyright (c) 2011-2012 Litecoin Developers
// Copyright (c) 2013 CasinoCoin Developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef BITCOIN_DB_H
@@ -19,25 +17,23 @@ class CAddress;
class CAddrMan;
class CBlockLocator;
class CDiskBlockIndex;
class CDiskTxPos;
class CMasterKey;
class COutPoint;
class CTxIndex;
class CWallet;
class CWalletTx;
extern unsigned int nWalletDBUpdated;
void ThreadFlushWalletDB(void* parg);
void ThreadFlushWalletDB(const std::string& strWalletFile);
bool BackupWallet(const CWallet& wallet, const std::string& strDest);
class CDBEnv
{
private:
bool fDetachDB;
bool fDbEnvInit;
boost::filesystem::path pathEnv;
bool fMockDb;
boost::filesystem::path path;
void EnvShutdown();
@@ -49,14 +45,34 @@ public:
CDBEnv();
~CDBEnv();
bool Open(boost::filesystem::path pathEnv_);
void MakeMock();
bool IsMock() { return fMockDb; }
/*
* Verify that database file strFile is OK. If it is not,
* call the callback to try to recover.
* This must be called BEFORE strFile is opened.
* Returns true if strFile is OK.
*/
enum VerifyResult { VERIFY_OK, RECOVER_OK, RECOVER_FAIL };
VerifyResult Verify(std::string strFile, bool (*recoverFunc)(CDBEnv& dbenv, std::string strFile));
/*
* Salvage data from a file that Verify says is bad.
* fAggressive sets the DB_AGGRESSIVE flag (see berkeley DB->verify() method documentation).
* Appends binary key/value pairs to vResult, returns true if successful.
* NOTE: reads the entire database into memory, so cannot be used
* for huge databases.
*/
typedef std::pair<std::vector<unsigned char>, std::vector<unsigned char> > KeyValPair;
bool Salvage(std::string strFile, bool fAggressive, std::vector<KeyValPair>& vResult);
bool Open(const boost::filesystem::path &path);
void Close();
void Flush(bool fShutdown);
void CheckpointLSN(std::string strFile);
void SetDetach(bool fDetachDB_) { fDetachDB = fDetachDB_; }
bool GetDetach() { return fDetachDB; }
void CloseDb(const std::string& strFile);
bool RemoveDb(const std::string& strFile);
DbTxn *TxnBegin(int flags=DB_TXN_WRITE_NOSYNC)
{
@@ -83,6 +99,7 @@ protected:
explicit CDB(const char* pszFile, const char* pszMode="r+");
~CDB() { Close(); }
public:
void Flush();
void Close();
private:
CDB(const CDB&);
@@ -295,36 +312,6 @@ public:
/** Access to the transaction database (blkindex.dat) */
class CTxDB : public CDB
{
public:
CTxDB(const char* pszMode="r+") : CDB("blkindex.dat", pszMode) { }
private:
CTxDB(const CTxDB&);
void operator=(const CTxDB&);
public:
bool ReadTxIndex(uint256 hash, CTxIndex& txindex);
bool UpdateTxIndex(uint256 hash, const CTxIndex& txindex);
bool AddTxIndex(const CTransaction& tx, const CDiskTxPos& pos, int nHeight);
bool EraseTxIndex(const CTransaction& tx);
bool ContainsTx(uint256 hash);
bool ReadDiskTx(uint256 hash, CTransaction& tx, CTxIndex& txindex);
bool ReadDiskTx(uint256 hash, CTransaction& tx);
bool ReadDiskTx(COutPoint outpoint, CTransaction& tx, CTxIndex& txindex);
bool ReadDiskTx(COutPoint outpoint, CTransaction& tx);
bool WriteBlockIndex(const CDiskBlockIndex& blockindex);
bool ReadHashBestChain(uint256& hashBestChain);
bool WriteHashBestChain(uint256 hashBestChain);
bool ReadBestInvalidWork(CBigNum& bnBestInvalidWork);
bool WriteBestInvalidWork(CBigNum bnBestInvalidWork);
bool LoadBlockIndex();
private:
bool LoadBlockIndexGuts();
};
/** Access to the (IP) address database (peers.dat) */
class CAddrDB

58
src/hash.cpp Normal file
View File

@@ -0,0 +1,58 @@
#include "hash.h"
inline uint32_t ROTL32 ( uint32_t x, int8_t r )
{
return (x << r) | (x >> (32 - r));
}
unsigned int MurmurHash3(unsigned int nHashSeed, const std::vector<unsigned char>& vDataToHash)
{
// The following is MurmurHash3 (x86_32), see http://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
uint32_t h1 = nHashSeed;
const uint32_t c1 = 0xcc9e2d51;
const uint32_t c2 = 0x1b873593;
const int nblocks = vDataToHash.size() / 4;
//----------
// body
const uint32_t * blocks = (const uint32_t *)(&vDataToHash[0] + nblocks*4);
for(int i = -nblocks; i; i++)
{
uint32_t k1 = blocks[i];
k1 *= c1;
k1 = ROTL32(k1,15);
k1 *= c2;
h1 ^= k1;
h1 = ROTL32(h1,13);
h1 = h1*5+0xe6546b64;
}
//----------
// tail
const uint8_t * tail = (const uint8_t*)(&vDataToHash[0] + nblocks*4);
uint32_t k1 = 0;
switch(vDataToHash.size() & 3)
{
case 3: k1 ^= tail[2] << 16;
case 2: k1 ^= tail[1] << 8;
case 1: k1 ^= tail[0];
k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
};
//----------
// finalization
h1 ^= vDataToHash.size();
h1 ^= h1 >> 16;
h1 *= 0x85ebca6b;
h1 ^= h1 >> 13;
h1 *= 0xc2b2ae35;
h1 ^= h1 >> 16;
return h1;
}

126
src/hash.h Normal file
View File

@@ -0,0 +1,126 @@
// Copyright (c) 2009-2010 Satoshi Nakamoto
// Copyright (c) 2009-2012 The Bitcoin developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef BITCOIN_HASH_H
#define BITCOIN_HASH_H
#include "uint256.h"
#include "serialize.h"
#include <openssl/sha.h>
#include <openssl/ripemd.h>
#include <vector>
template<typename T1>
inline uint256 Hash(const T1 pbegin, const T1 pend)
{
static unsigned char pblank[1];
uint256 hash1;
SHA256((pbegin == pend ? pblank : (unsigned char*)&pbegin[0]), (pend - pbegin) * sizeof(pbegin[0]), (unsigned char*)&hash1);
uint256 hash2;
SHA256((unsigned char*)&hash1, sizeof(hash1), (unsigned char*)&hash2);
return hash2;
}
class CHashWriter
{
private:
SHA256_CTX ctx;
public:
int nType;
int nVersion;
void Init() {
SHA256_Init(&ctx);
}
CHashWriter(int nTypeIn, int nVersionIn) : nType(nTypeIn), nVersion(nVersionIn) {
Init();
}
CHashWriter& write(const char *pch, size_t size) {
SHA256_Update(&ctx, pch, size);
return (*this);
}
// invalidates the object
uint256 GetHash() {
uint256 hash1;
SHA256_Final((unsigned char*)&hash1, &ctx);
uint256 hash2;
SHA256((unsigned char*)&hash1, sizeof(hash1), (unsigned char*)&hash2);
return hash2;
}
template<typename T>
CHashWriter& operator<<(const T& obj) {
// Serialize to this stream
::Serialize(*this, obj, nType, nVersion);
return (*this);
}
};
template<typename T1, typename T2>
inline uint256 Hash(const T1 p1begin, const T1 p1end,
const T2 p2begin, const T2 p2end)
{
static unsigned char pblank[1];
uint256 hash1;
SHA256_CTX ctx;
SHA256_Init(&ctx);
SHA256_Update(&ctx, (p1begin == p1end ? pblank : (unsigned char*)&p1begin[0]), (p1end - p1begin) * sizeof(p1begin[0]));
SHA256_Update(&ctx, (p2begin == p2end ? pblank : (unsigned char*)&p2begin[0]), (p2end - p2begin) * sizeof(p2begin[0]));
SHA256_Final((unsigned char*)&hash1, &ctx);
uint256 hash2;
SHA256((unsigned char*)&hash1, sizeof(hash1), (unsigned char*)&hash2);
return hash2;
}
template<typename T1, typename T2, typename T3>
inline uint256 Hash(const T1 p1begin, const T1 p1end,
const T2 p2begin, const T2 p2end,
const T3 p3begin, const T3 p3end)
{
static unsigned char pblank[1];
uint256 hash1;
SHA256_CTX ctx;
SHA256_Init(&ctx);
SHA256_Update(&ctx, (p1begin == p1end ? pblank : (unsigned char*)&p1begin[0]), (p1end - p1begin) * sizeof(p1begin[0]));
SHA256_Update(&ctx, (p2begin == p2end ? pblank : (unsigned char*)&p2begin[0]), (p2end - p2begin) * sizeof(p2begin[0]));
SHA256_Update(&ctx, (p3begin == p3end ? pblank : (unsigned char*)&p3begin[0]), (p3end - p3begin) * sizeof(p3begin[0]));
SHA256_Final((unsigned char*)&hash1, &ctx);
uint256 hash2;
SHA256((unsigned char*)&hash1, sizeof(hash1), (unsigned char*)&hash2);
return hash2;
}
template<typename T>
uint256 SerializeHash(const T& obj, int nType=SER_GETHASH, int nVersion=PROTOCOL_VERSION)
{
CHashWriter ss(nType, nVersion);
ss << obj;
return ss.GetHash();
}
template<typename T1>
inline uint160 Hash160(const T1 pbegin, const T1 pend)
{
static unsigned char pblank[1];
uint256 hash1;
SHA256((pbegin == pend ? pblank : (unsigned char*)&pbegin[0]), (pend - pbegin) * sizeof(pbegin[0]), (unsigned char*)&hash1);
uint160 hash2;
RIPEMD160((unsigned char*)&hash1, sizeof(hash1), (unsigned char*)&hash2);
return hash2;
}
inline uint160 Hash160(const std::vector<unsigned char>& vch)
{
return Hash160(vch.begin(), vch.end());
}
unsigned int MurmurHash3(unsigned int nHashSeed, const std::vector<unsigned char>& vDataToHash);
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -1,7 +1,5 @@
// Copyright (c) 2009-2010 Satoshi Nakamoto
// Copyright (c) 2009-2012 The Bitcoin developers
// Copyright (c) 2011-2012 Litecoin Developers
// Copyright (c) 2013 CasinoCoin Developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef BITCOIN_INIT_H
@@ -12,8 +10,9 @@
extern CWallet* pwalletMain;
void StartShutdown();
void Shutdown(void* parg);
bool AppInit2();
bool ShutdownRequested();
void Shutdown();
bool AppInit2(boost::thread_group& threadGroup);
std::string HelpMessage();
#endif

View File

@@ -28,7 +28,8 @@ namespace json_spirit
template< class String_type >
String_type non_printable_to_string( unsigned int c )
{
typedef typename String_type::value_type Char_type;
// Silence the warning: typedef Char_type locally defined but not used [-Wunused-local-typedefs]
// typedef typename String_type::value_type Char_type;
String_type result( 6, '\\' );

View File

@@ -1,16 +1,17 @@
// Copyright (c) 2009-2012 The Bitcoin developers
// Copyright (c) 2011-2012 Litecoin Developers
// Copyright (c) 2013 CasinoCoin Developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#include <map>
#include <openssl/ecdsa.h>
#include <openssl/rand.h>
#include <openssl/obj_mac.h>
#include "key.h"
// anonymous namespace with local implementation code (OpenSSL interaction)
namespace {
// Generate a private key from just the secret parameter
int EC_KEY_regenerate_key(EC_KEY *eckey, BIGNUM *priv_key)
{
@@ -50,7 +51,7 @@ err:
// Perform ECDSA key recovery (see SEC1 4.1.6) for curves over (mod p)-fields
// recid selects which key is recovered
// if check is nonzero, additional checks are performed
// if check is non-zero, additional checks are performed
int ECDSA_SIG_recover_key_GFp(EC_KEY *eckey, ECDSA_SIG *ecsig, const unsigned char *msg, int msglen, int recid, int check)
{
if (!eckey) return 0;
@@ -122,266 +123,274 @@ err:
return ret;
}
void CKey::SetCompressedPubKey()
{
EC_KEY_set_conv_form(pkey, POINT_CONVERSION_COMPRESSED);
fCompressedPubKey = true;
}
// RAII Wrapper around OpenSSL's EC_KEY
class CECKey {
private:
EC_KEY *pkey;
void CKey::Reset()
{
fCompressedPubKey = false;
if (pkey != NULL)
public:
CECKey() {
pkey = EC_KEY_new_by_curve_name(NID_secp256k1);
assert(pkey != NULL);
}
~CECKey() {
EC_KEY_free(pkey);
pkey = EC_KEY_new_by_curve_name(NID_secp256k1);
if (pkey == NULL)
throw key_error("CKey::CKey() : EC_KEY_new_by_curve_name failed");
fSet = false;
}
CKey::CKey()
{
pkey = NULL;
Reset();
}
CKey::CKey(const CKey& b)
{
pkey = EC_KEY_dup(b.pkey);
if (pkey == NULL)
throw key_error("CKey::CKey(const CKey&) : EC_KEY_dup failed");
fSet = b.fSet;
}
CKey& CKey::operator=(const CKey& b)
{
if (!EC_KEY_copy(pkey, b.pkey))
throw key_error("CKey::operator=(const CKey&) : EC_KEY_copy failed");
fSet = b.fSet;
return (*this);
}
CKey::~CKey()
{
EC_KEY_free(pkey);
}
bool CKey::IsNull() const
{
return !fSet;
}
bool CKey::IsCompressed() const
{
return fCompressedPubKey;
}
void CKey::MakeNewKey(bool fCompressed)
{
if (!EC_KEY_generate_key(pkey))
throw key_error("CKey::MakeNewKey() : EC_KEY_generate_key failed");
if (fCompressed)
SetCompressedPubKey();
fSet = true;
}
bool CKey::SetPrivKey(const CPrivKey& vchPrivKey)
{
const unsigned char* pbegin = &vchPrivKey[0];
if (!d2i_ECPrivateKey(&pkey, &pbegin, vchPrivKey.size()))
return false;
fSet = true;
return true;
}
bool CKey::SetSecret(const CSecret& vchSecret, bool fCompressed)
{
EC_KEY_free(pkey);
pkey = EC_KEY_new_by_curve_name(NID_secp256k1);
if (pkey == NULL)
throw key_error("CKey::SetSecret() : EC_KEY_new_by_curve_name failed");
if (vchSecret.size() != 32)
throw key_error("CKey::SetSecret() : secret must be 32 bytes");
BIGNUM *bn = BN_bin2bn(&vchSecret[0],32,BN_new());
if (bn == NULL)
throw key_error("CKey::SetSecret() : BN_bin2bn failed");
if (!EC_KEY_regenerate_key(pkey,bn))
{
BN_clear_free(bn);
throw key_error("CKey::SetSecret() : EC_KEY_regenerate_key failed");
}
BN_clear_free(bn);
fSet = true;
if (fCompressed || fCompressedPubKey)
SetCompressedPubKey();
return true;
}
CSecret CKey::GetSecret(bool &fCompressed) const
{
CSecret vchRet;
vchRet.resize(32);
const BIGNUM *bn = EC_KEY_get0_private_key(pkey);
int nBytes = BN_num_bytes(bn);
if (bn == NULL)
throw key_error("CKey::GetSecret() : EC_KEY_get0_private_key failed");
int n=BN_bn2bin(bn,&vchRet[32 - nBytes]);
if (n != nBytes)
throw key_error("CKey::GetSecret(): BN_bn2bin failed");
fCompressed = fCompressedPubKey;
return vchRet;
}
CPrivKey CKey::GetPrivKey() const
{
int nSize = i2d_ECPrivateKey(pkey, NULL);
if (!nSize)
throw key_error("CKey::GetPrivKey() : i2d_ECPrivateKey failed");
CPrivKey vchPrivKey(nSize, 0);
unsigned char* pbegin = &vchPrivKey[0];
if (i2d_ECPrivateKey(pkey, &pbegin) != nSize)
throw key_error("CKey::GetPrivKey() : i2d_ECPrivateKey returned unexpected size");
return vchPrivKey;
}
bool CKey::SetPubKey(const CPubKey& vchPubKey)
{
const unsigned char* pbegin = &vchPubKey.vchPubKey[0];
if (!o2i_ECPublicKey(&pkey, &pbegin, vchPubKey.vchPubKey.size()))
return false;
fSet = true;
if (vchPubKey.vchPubKey.size() == 33)
SetCompressedPubKey();
return true;
}
CPubKey CKey::GetPubKey() const
{
int nSize = i2o_ECPublicKey(pkey, NULL);
if (!nSize)
throw key_error("CKey::GetPubKey() : i2o_ECPublicKey failed");
std::vector<unsigned char> vchPubKey(nSize, 0);
unsigned char* pbegin = &vchPubKey[0];
if (i2o_ECPublicKey(pkey, &pbegin) != nSize)
throw key_error("CKey::GetPubKey() : i2o_ECPublicKey returned unexpected size");
return CPubKey(vchPubKey);
}
bool CKey::Sign(uint256 hash, std::vector<unsigned char>& vchSig)
{
unsigned int nSize = ECDSA_size(pkey);
vchSig.resize(nSize); // Make sure it is big enough
if (!ECDSA_sign(0, (unsigned char*)&hash, sizeof(hash), &vchSig[0], &nSize, pkey))
{
vchSig.clear();
return false;
void GetSecretBytes(unsigned char vch[32]) const {
const BIGNUM *bn = EC_KEY_get0_private_key(pkey);
assert(bn);
int nBytes = BN_num_bytes(bn);
int n=BN_bn2bin(bn,&vch[32 - nBytes]);
assert(n == nBytes);
memset(vch, 0, 32 - nBytes);
}
vchSig.resize(nSize); // Shrink to fit actual size
return true;
}
// create a compact signature (65 bytes), which allows reconstructing the used public key
// The format is one header byte, followed by two times 32 bytes for the serialized r and s values.
// The header byte: 0x1B = first key with even y, 0x1C = first key with odd y,
// 0x1D = second key with even y, 0x1E = second key with odd y
bool CKey::SignCompact(uint256 hash, std::vector<unsigned char>& vchSig)
{
bool fOk = false;
ECDSA_SIG *sig = ECDSA_do_sign((unsigned char*)&hash, sizeof(hash), pkey);
if (sig==NULL)
return false;
vchSig.clear();
vchSig.resize(65,0);
int nBitsR = BN_num_bits(sig->r);
int nBitsS = BN_num_bits(sig->s);
if (nBitsR <= 256 && nBitsS <= 256)
{
int nRecId = -1;
for (int i=0; i<4; i++)
{
CKey keyRec;
keyRec.fSet = true;
if (fCompressedPubKey)
keyRec.SetCompressedPubKey();
if (ECDSA_SIG_recover_key_GFp(keyRec.pkey, sig, (unsigned char*)&hash, sizeof(hash), i, 1) == 1)
if (keyRec.GetPubKey() == this->GetPubKey())
{
nRecId = i;
break;
}
void SetSecretBytes(const unsigned char vch[32]) {
BIGNUM bn;
BN_init(&bn);
assert(BN_bin2bn(vch, 32, &bn));
assert(EC_KEY_regenerate_key(pkey, &bn));
BN_clear_free(&bn);
}
void GetPrivKey(CPrivKey &privkey, bool fCompressed) {
EC_KEY_set_conv_form(pkey, fCompressed ? POINT_CONVERSION_COMPRESSED : POINT_CONVERSION_UNCOMPRESSED);
int nSize = i2d_ECPrivateKey(pkey, NULL);
assert(nSize);
privkey.resize(nSize);
unsigned char* pbegin = &privkey[0];
int nSize2 = i2d_ECPrivateKey(pkey, &pbegin);
assert(nSize == nSize2);
}
bool SetPrivKey(const CPrivKey &privkey) {
const unsigned char* pbegin = &privkey[0];
if (d2i_ECPrivateKey(&pkey, &pbegin, privkey.size())) {
// d2i_ECPrivateKey returns true if parsing succeeds.
// This doesn't necessarily mean the key is valid.
if (EC_KEY_check_key(pkey))
return true;
}
if (nRecId == -1)
throw key_error("CKey::SignCompact() : unable to construct recoverable key");
vchSig[0] = nRecId+27+(fCompressedPubKey ? 4 : 0);
BN_bn2bin(sig->r,&vchSig[33-(nBitsR+7)/8]);
BN_bn2bin(sig->s,&vchSig[65-(nBitsS+7)/8]);
fOk = true;
}
ECDSA_SIG_free(sig);
return fOk;
}
// reconstruct public key from a compact signature
// This is only slightly more CPU intensive than just verifying it.
// If this function succeeds, the recovered public key is guaranteed to be valid
// (the signature is a valid signature of the given data for that key)
bool CKey::SetCompactSignature(uint256 hash, const std::vector<unsigned char>& vchSig)
{
if (vchSig.size() != 65)
return false;
int nV = vchSig[0];
if (nV<27 || nV>=35)
return false;
ECDSA_SIG *sig = ECDSA_SIG_new();
BN_bin2bn(&vchSig[1],32,sig->r);
BN_bin2bn(&vchSig[33],32,sig->s);
EC_KEY_free(pkey);
pkey = EC_KEY_new_by_curve_name(NID_secp256k1);
if (nV >= 31)
{
SetCompressedPubKey();
nV -= 4;
}
if (ECDSA_SIG_recover_key_GFp(pkey, sig, (unsigned char*)&hash, sizeof(hash), nV - 27, 0) == 1)
{
fSet = true;
ECDSA_SIG_free(sig);
void GetPubKey(CPubKey &pubkey, bool fCompressed) {
EC_KEY_set_conv_form(pkey, fCompressed ? POINT_CONVERSION_COMPRESSED : POINT_CONVERSION_UNCOMPRESSED);
int nSize = i2o_ECPublicKey(pkey, NULL);
assert(nSize);
assert(nSize <= 65);
unsigned char c[65];
unsigned char *pbegin = c;
int nSize2 = i2o_ECPublicKey(pkey, &pbegin);
assert(nSize == nSize2);
pubkey.Set(&c[0], &c[nSize]);
}
bool SetPubKey(const CPubKey &pubkey) {
const unsigned char* pbegin = pubkey.begin();
return o2i_ECPublicKey(&pkey, &pbegin, pubkey.size());
}
bool Sign(const uint256 &hash, std::vector<unsigned char>& vchSig) {
unsigned int nSize = ECDSA_size(pkey);
vchSig.resize(nSize); // Make sure it is big enough
assert(ECDSA_sign(0, (unsigned char*)&hash, sizeof(hash), &vchSig[0], &nSize, pkey));
vchSig.resize(nSize); // Shrink to fit actual size
return true;
}
return false;
}
bool CKey::Verify(uint256 hash, const std::vector<unsigned char>& vchSig)
{
// -1 = error, 0 = bad sig, 1 = good
if (ECDSA_verify(0, (unsigned char*)&hash, sizeof(hash), &vchSig[0], vchSig.size(), pkey) != 1)
bool Verify(const uint256 &hash, const std::vector<unsigned char>& vchSig) {
// -1 = error, 0 = bad sig, 1 = good
if (ECDSA_verify(0, (unsigned char*)&hash, sizeof(hash), &vchSig[0], vchSig.size(), pkey) != 1)
return false;
return true;
}
bool SignCompact(const uint256 &hash, unsigned char *p64, int &rec) {
bool fOk = false;
ECDSA_SIG *sig = ECDSA_do_sign((unsigned char*)&hash, sizeof(hash), pkey);
if (sig==NULL)
return false;
memset(p64, 0, 64);
int nBitsR = BN_num_bits(sig->r);
int nBitsS = BN_num_bits(sig->s);
if (nBitsR <= 256 && nBitsS <= 256) {
CPubKey pubkey;
GetPubKey(pubkey, true);
for (int i=0; i<4; i++) {
CECKey keyRec;
if (ECDSA_SIG_recover_key_GFp(keyRec.pkey, sig, (unsigned char*)&hash, sizeof(hash), i, 1) == 1) {
CPubKey pubkeyRec;
keyRec.GetPubKey(pubkeyRec, true);
if (pubkeyRec == pubkey) {
rec = i;
fOk = true;
break;
}
}
}
assert(fOk);
BN_bn2bin(sig->r,&p64[32-(nBitsR+7)/8]);
BN_bn2bin(sig->s,&p64[64-(nBitsS+7)/8]);
}
ECDSA_SIG_free(sig);
return fOk;
}
// reconstruct public key from a compact signature
// This is only slightly more CPU intensive than just verifying it.
// If this function succeeds, the recovered public key is guaranteed to be valid
// (the signature is a valid signature of the given data for that key)
bool Recover(const uint256 &hash, const unsigned char *p64, int rec)
{
if (rec<0 || rec>=3)
return false;
ECDSA_SIG *sig = ECDSA_SIG_new();
BN_bin2bn(&p64[0], 32, sig->r);
BN_bin2bn(&p64[32], 32, sig->s);
bool ret = ECDSA_SIG_recover_key_GFp(pkey, sig, (unsigned char*)&hash, sizeof(hash), rec, 0) == 1;
ECDSA_SIG_free(sig);
return ret;
}
};
}; // end of anonymous namespace
bool CKey::Check(const unsigned char *vch) {
// Do not convert to OpenSSL's data structures for range-checking keys,
// it's easy enough to do directly.
static const unsigned char vchMax[32] = {
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFE,
0xBA,0xAE,0xDC,0xE6,0xAF,0x48,0xA0,0x3B,
0xBF,0xD2,0x5E,0x8C,0xD0,0x36,0x41,0x40
};
bool fIsZero = true;
for (int i=0; i<32 && fIsZero; i++)
if (vch[i] != 0)
fIsZero = false;
if (fIsZero)
return false;
for (int i=0; i<32; i++) {
if (vch[i] < vchMax[i])
return true;
if (vch[i] > vchMax[i])
return false;
}
return true;
}
bool CKey::VerifyCompact(uint256 hash, const std::vector<unsigned char>& vchSig)
{
CKey key;
if (!key.SetCompactSignature(hash, vchSig))
return false;
if (GetPubKey() != key.GetPubKey())
return false;
void CKey::MakeNewKey(bool fCompressedIn) {
do {
RAND_bytes(vch, sizeof(vch));
} while (!Check(vch));
fValid = true;
fCompressed = fCompressedIn;
}
bool CKey::SetPrivKey(const CPrivKey &privkey, bool fCompressedIn) {
CECKey key;
if (!key.SetPrivKey(privkey))
return false;
key.GetSecretBytes(vch);
fCompressed = fCompressedIn;
fValid = true;
return true;
}
bool CKey::IsValid()
{
if (!fSet)
return false;
bool fCompr;
CSecret secret = GetSecret(fCompr);
CKey key2;
key2.SetSecret(secret, fCompr);
return GetPubKey() == key2.GetPubKey();
CPrivKey CKey::GetPrivKey() const {
assert(fValid);
CECKey key;
key.SetSecretBytes(vch);
CPrivKey privkey;
key.GetPrivKey(privkey, fCompressed);
return privkey;
}
CPubKey CKey::GetPubKey() const {
assert(fValid);
CECKey key;
key.SetSecretBytes(vch);
CPubKey pubkey;
key.GetPubKey(pubkey, fCompressed);
return pubkey;
}
bool CKey::Sign(const uint256 &hash, std::vector<unsigned char>& vchSig) const {
if (!fValid)
return false;
CECKey key;
key.SetSecretBytes(vch);
return key.Sign(hash, vchSig);
}
bool CKey::SignCompact(const uint256 &hash, std::vector<unsigned char>& vchSig) const {
if (!fValid)
return false;
CECKey key;
key.SetSecretBytes(vch);
vchSig.resize(65);
int rec = -1;
if (!key.SignCompact(hash, &vchSig[1], rec))
return false;
assert(rec != -1);
vchSig[0] = 27 + rec + (fCompressed ? 4 : 0);
return true;
}
bool CPubKey::Verify(const uint256 &hash, const std::vector<unsigned char>& vchSig) const {
if (!IsValid())
return false;
CECKey key;
if (!key.SetPubKey(*this))
return false;
if (!key.Verify(hash, vchSig))
return false;
return true;
}
bool CPubKey::RecoverCompact(const uint256 &hash, const std::vector<unsigned char>& vchSig) {
if (vchSig.size() != 65)
return false;
CECKey key;
if (!key.Recover(hash, &vchSig[1], (vchSig[0] - 27) & ~4))
return false;
key.GetPubKey(*this, (vchSig[0] - 27) & 4);
return true;
}
bool CPubKey::VerifyCompact(const uint256 &hash, const std::vector<unsigned char>& vchSig) const {
if (!IsValid())
return false;
if (vchSig.size() != 65)
return false;
CECKey key;
if (!key.Recover(hash, &vchSig[1], (vchSig[0] - 27) & ~4))
return false;
CPubKey pubkeyRec;
key.GetPubKey(pubkeyRec, IsCompressed());
if (*this != pubkeyRec)
return false;
return true;
}
bool CPubKey::IsFullyValid() const {
if (!IsValid())
return false;
CECKey key;
if (!key.SetPubKey(*this))
return false;
return true;
}
bool CPubKey::Decompress() {
if (!IsValid())
return false;
CECKey key;
if (!key.SetPubKey(*this))
return false;
key.GetPubKey(*this, false);
return true;
}

260
src/key.h
View File

@@ -1,37 +1,17 @@
// Copyright (c) 2009-2010 Satoshi Nakamoto
// Copyright (c) 2009-2012 The Bitcoin developers
// Copyright (c) 2011-2012 Litecoin Developers
// Copyright (c) 2013 CasinoCoin Developers
// Copyright (c) 2009-2013 The Bitcoin developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef BITCOIN_KEY_H
#define BITCOIN_KEY_H
#include <stdexcept>
#include <vector>
#include "allocators.h"
#include "serialize.h"
#include "uint256.h"
#include "util.h"
#include "hash.h"
#include <openssl/ec.h> // for EC_KEY definition
// secp160k1
// const unsigned int PRIVATE_KEY_SIZE = 192;
// const unsigned int PUBLIC_KEY_SIZE = 41;
// const unsigned int SIGNATURE_SIZE = 48;
//
// secp192k1
// const unsigned int PRIVATE_KEY_SIZE = 222;
// const unsigned int PUBLIC_KEY_SIZE = 49;
// const unsigned int SIGNATURE_SIZE = 57;
//
// secp224k1
// const unsigned int PRIVATE_KEY_SIZE = 250;
// const unsigned int PUBLIC_KEY_SIZE = 57;
// const unsigned int SIGNATURE_SIZE = 66;
//
// secp256k1:
// const unsigned int PRIVATE_KEY_SIZE = 279;
// const unsigned int PUBLIC_KEY_SIZE = 65;
@@ -40,12 +20,6 @@
// see www.keylength.com
// script supports up to 75 for single byte push
class key_error : public std::runtime_error
{
public:
explicit key_error(const std::string& str) : std::runtime_error(str) {}
};
/** A reference to a CKey: the Hash160 of its serialized public key */
class CKeyID : public uint160
{
@@ -65,100 +39,218 @@ public:
/** An encapsulated public key. */
class CPubKey {
private:
std::vector<unsigned char> vchPubKey;
friend class CKey;
// Just store the serialized data.
// Its length can very cheaply be computed from the first byte.
unsigned char vch[65];
// Compute the length of a pubkey with a given first byte.
unsigned int static GetLen(unsigned char chHeader) {
if (chHeader == 2 || chHeader == 3)
return 33;
if (chHeader == 4 || chHeader == 6 || chHeader == 7)
return 65;
return 0;
}
// Set this key data to be invalid
void Invalidate() {
vch[0] = 0xFF;
}
public:
CPubKey() { }
CPubKey(const std::vector<unsigned char> &vchPubKeyIn) : vchPubKey(vchPubKeyIn) { }
friend bool operator==(const CPubKey &a, const CPubKey &b) { return a.vchPubKey == b.vchPubKey; }
friend bool operator!=(const CPubKey &a, const CPubKey &b) { return a.vchPubKey != b.vchPubKey; }
friend bool operator<(const CPubKey &a, const CPubKey &b) { return a.vchPubKey < b.vchPubKey; }
// Construct an invalid public key.
CPubKey() {
Invalidate();
}
IMPLEMENT_SERIALIZE(
READWRITE(vchPubKey);
)
// Initialize a public key using begin/end iterators to byte data.
template<typename T>
void Set(const T pbegin, const T pend) {
int len = pend == pbegin ? 0 : GetLen(pbegin[0]);
if (len && len == (pend-pbegin))
memcpy(vch, (unsigned char*)&pbegin[0], len);
else
Invalidate();
}
// Construct a public key using begin/end iterators to byte data.
template<typename T>
CPubKey(const T pbegin, const T pend) {
Set(pbegin, pend);
}
// Construct a public key from a byte vector.
CPubKey(const std::vector<unsigned char> &vch) {
Set(vch.begin(), vch.end());
}
// Simple read-only vector-like interface to the pubkey data.
unsigned int size() const { return GetLen(vch[0]); }
const unsigned char *begin() const { return vch; }
const unsigned char *end() const { return vch+size(); }
const unsigned char &operator[](unsigned int pos) const { return vch[pos]; }
// Comparator implementation.
friend bool operator==(const CPubKey &a, const CPubKey &b) {
return a.vch[0] == b.vch[0] &&
memcmp(a.vch, b.vch, a.size()) == 0;
}
friend bool operator!=(const CPubKey &a, const CPubKey &b) {
return !(a == b);
}
friend bool operator<(const CPubKey &a, const CPubKey &b) {
return a.vch[0] < b.vch[0] ||
(a.vch[0] == b.vch[0] && memcmp(a.vch, b.vch, a.size()) < 0);
}
// Implement serialization, as if this was a byte vector.
unsigned int GetSerializeSize(int nType, int nVersion) const {
return size() + 1;
}
template<typename Stream> void Serialize(Stream &s, int nType, int nVersion) const {
unsigned int len = size();
::WriteCompactSize(s, len);
s.write((char*)vch, len);
}
template<typename Stream> void Unserialize(Stream &s, int nType, int nVersion) {
unsigned int len = ::ReadCompactSize(s);
if (len <= 65) {
s.read((char*)vch, len);
} else {
// invalid pubkey, skip available data
char dummy;
while (len--)
s.read(&dummy, 1);
Invalidate();
}
}
// Get the KeyID of this public key (hash of its serialization)
CKeyID GetID() const {
return CKeyID(Hash160(vchPubKey));
return CKeyID(Hash160(vch, vch+size()));
}
// Get the 256-bit hash of this public key.
uint256 GetHash() const {
return Hash(vchPubKey.begin(), vchPubKey.end());
return Hash(vch, vch+size());
}
// just check syntactic correctness.
bool IsValid() const {
return vchPubKey.size() == 33 || vchPubKey.size() == 65;
return size() > 0;
}
// fully validate whether this is a valid public key (more expensive than IsValid())
bool IsFullyValid() const;
// Check whether this is a compressed public key.
bool IsCompressed() const {
return vchPubKey.size() == 33;
return size() == 33;
}
std::vector<unsigned char> Raw() const {
return vchPubKey;
}
// Verify a DER signature (~72 bytes).
// If this public key is not fully valid, the return value will be false.
bool Verify(const uint256 &hash, const std::vector<unsigned char>& vchSig) const;
// Verify a compact signature (~65 bytes).
// See CKey::SignCompact.
bool VerifyCompact(const uint256 &hash, const std::vector<unsigned char>& vchSig) const;
// Recover a public key from a compact signature.
bool RecoverCompact(const uint256 &hash, const std::vector<unsigned char>& vchSig);
// Turn this public key into an uncompressed public key.
bool Decompress();
};
// secure_allocator is defined in serialize.h
// secure_allocator is defined in allocators.h
// CPrivKey is a serialized private key, with all parameters included (279 bytes)
typedef std::vector<unsigned char, secure_allocator<unsigned char> > CPrivKey;
// CSecret is a serialization of just the secret parameter (32 bytes)
typedef std::vector<unsigned char, secure_allocator<unsigned char> > CSecret;
/** An encapsulated OpenSSL Elliptic Curve key (public and/or private) */
class CKey
{
protected:
EC_KEY* pkey;
bool fSet;
bool fCompressedPubKey;
/** An encapsulated private key. */
class CKey {
private:
// Whether this private key is valid. We check for correctness when modifying the key
// data, so fValid should always correspond to the actual state.
bool fValid;
void SetCompressedPubKey();
// Whether the public key corresponding to this private key is (to be) compressed.
bool fCompressed;
// The actual byte data
unsigned char vch[32];
// Check whether the 32-byte array pointed to be vch is valid keydata.
bool static Check(const unsigned char *vch);
public:
void Reset();
// Construct an invalid private key.
CKey() : fValid(false) {
LockObject(vch);
}
CKey();
CKey(const CKey& b);
// Copy constructor. This is necessary because of memlocking.
CKey(const CKey &secret) : fValid(secret.fValid), fCompressed(secret.fCompressed) {
LockObject(vch);
memcpy(vch, secret.vch, sizeof(vch));
}
CKey& operator=(const CKey& b);
// Destructor (again necessary because of memlocking).
~CKey() {
UnlockObject(vch);
}
~CKey();
// Initialize using begin and end iterators to byte data.
template<typename T>
void Set(const T pbegin, const T pend, bool fCompressedIn) {
if (pend - pbegin != 32) {
fValid = false;
return;
}
if (Check(&pbegin[0])) {
memcpy(vch, (unsigned char*)&pbegin[0], 32);
fValid = true;
fCompressed = fCompressedIn;
} else {
fValid = false;
}
}
bool IsNull() const;
bool IsCompressed() const;
// Simple read-only vector-like interface.
unsigned int size() const { return (fValid ? 32 : 0); }
const unsigned char *begin() const { return vch; }
const unsigned char *end() const { return vch + size(); }
// Check whether this private key is valid.
bool IsValid() const { return fValid; }
// Check whether the public key corresponding to this private key is (to be) compressed.
bool IsCompressed() const { return fCompressed; }
// Initialize from a CPrivKey (serialized OpenSSL private key data).
bool SetPrivKey(const CPrivKey &vchPrivKey, bool fCompressed);
// Generate a new private key using a cryptographic PRNG.
void MakeNewKey(bool fCompressed);
bool SetPrivKey(const CPrivKey& vchPrivKey);
bool SetSecret(const CSecret& vchSecret, bool fCompressed = false);
CSecret GetSecret(bool &fCompressed) const;
// Convert the private key to a CPrivKey (serialized OpenSSL private key data).
// This is expensive.
CPrivKey GetPrivKey() const;
bool SetPubKey(const CPubKey& vchPubKey);
// Compute the public key from a private key.
// This is expensive.
CPubKey GetPubKey() const;
bool Sign(uint256 hash, std::vector<unsigned char>& vchSig);
// Create a DER-serialized signature.
bool Sign(const uint256 &hash, std::vector<unsigned char>& vchSig) const;
// create a compact signature (65 bytes), which allows reconstructing the used public key
// Create a compact signature (65 bytes), which allows reconstructing the used public key.
// The format is one header byte, followed by two times 32 bytes for the serialized r and s values.
// The header byte: 0x1B = first key with even y, 0x1C = first key with odd y,
// 0x1D = second key with even y, 0x1E = second key with odd y
bool SignCompact(uint256 hash, std::vector<unsigned char>& vchSig);
// reconstruct public key from a compact signature
// This is only slightly more CPU intensive than just verifying it.
// If this function succeeds, the recovered public key is guaranteed to be valid
// (the signature is a valid signature of the given data for that key)
bool SetCompactSignature(uint256 hash, const std::vector<unsigned char>& vchSig);
bool Verify(uint256 hash, const std::vector<unsigned char>& vchSig);
// Verify a compact signature
bool VerifyCompact(uint256 hash, const std::vector<unsigned char>& vchSig);
bool IsValid();
// 0x1D = second key with even y, 0x1E = second key with odd y,
// add 0x04 for compressed keys.
bool SignCompact(const uint256 &hash, std::vector<unsigned char>& vchSig) const;
};
#endif

View File

@@ -1,7 +1,5 @@
// Copyright (c) 2009-2010 Satoshi Nakamoto
// Copyright (c) 2009-2012 The Bitcoin developers
// Copyright (c) 2011-2012 Litecoin Developers
// Copyright (c) 2013 CasinoCoin Developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
@@ -17,61 +15,50 @@ bool CKeyStore::GetPubKey(const CKeyID &address, CPubKey &vchPubKeyOut) const
return true;
}
bool CBasicKeyStore::AddKey(const CKey& key)
bool CKeyStore::AddKey(const CKey &key) {
return AddKeyPubKey(key, key.GetPubKey());
}
bool CBasicKeyStore::AddKeyPubKey(const CKey& key, const CPubKey &pubkey)
{
bool fCompressed = false;
CSecret secret = key.GetSecret(fCompressed);
{
LOCK(cs_KeyStore);
mapKeys[key.GetPubKey().GetID()] = make_pair(secret, fCompressed);
}
LOCK(cs_KeyStore);
mapKeys[pubkey.GetID()] = key;
return true;
}
bool CBasicKeyStore::AddCScript(const CScript& redeemScript)
{
{
LOCK(cs_KeyStore);
mapScripts[redeemScript.GetID()] = redeemScript;
}
LOCK(cs_KeyStore);
mapScripts[redeemScript.GetID()] = redeemScript;
return true;
}
bool CBasicKeyStore::HaveCScript(const CScriptID& hash) const
{
bool result;
{
LOCK(cs_KeyStore);
result = (mapScripts.count(hash) > 0);
}
return result;
LOCK(cs_KeyStore);
return mapScripts.count(hash) > 0;
}
bool CBasicKeyStore::GetCScript(const CScriptID &hash, CScript& redeemScriptOut) const
{
LOCK(cs_KeyStore);
ScriptMap::const_iterator mi = mapScripts.find(hash);
if (mi != mapScripts.end())
{
LOCK(cs_KeyStore);
ScriptMap::const_iterator mi = mapScripts.find(hash);
if (mi != mapScripts.end())
{
redeemScriptOut = (*mi).second;
return true;
}
redeemScriptOut = (*mi).second;
return true;
}
return false;
}
bool CCryptoKeyStore::SetCrypted()
{
{
LOCK(cs_KeyStore);
if (fUseCrypto)
return true;
if (!mapKeys.empty())
return false;
fUseCrypto = true;
}
LOCK(cs_KeyStore);
if (fUseCrypto)
return true;
if (!mapKeys.empty())
return false;
fUseCrypto = true;
return true;
}
@@ -101,14 +88,13 @@ bool CCryptoKeyStore::Unlock(const CKeyingMaterial& vMasterKeyIn)
{
const CPubKey &vchPubKey = (*mi).second.first;
const std::vector<unsigned char> &vchCryptedSecret = (*mi).second.second;
CSecret vchSecret;
CKeyingMaterial vchSecret;
if(!DecryptSecret(vMasterKeyIn, vchCryptedSecret, vchPubKey.GetHash(), vchSecret))
return false;
if (vchSecret.size() != 32)
return false;
CKey key;
key.SetPubKey(vchPubKey);
key.SetSecret(vchSecret);
key.Set(vchSecret.begin(), vchSecret.end(), vchPubKey.IsCompressed());
if (key.GetPubKey() == vchPubKey)
break;
return false;
@@ -119,23 +105,22 @@ bool CCryptoKeyStore::Unlock(const CKeyingMaterial& vMasterKeyIn)
return true;
}
bool CCryptoKeyStore::AddKey(const CKey& key)
bool CCryptoKeyStore::AddKeyPubKey(const CKey& key, const CPubKey &pubkey)
{
{
LOCK(cs_KeyStore);
if (!IsCrypted())
return CBasicKeyStore::AddKey(key);
return CBasicKeyStore::AddKeyPubKey(key, pubkey);
if (IsLocked())
return false;
std::vector<unsigned char> vchCryptedSecret;
CPubKey vchPubKey = key.GetPubKey();
bool fCompressed;
if (!EncryptSecret(vMasterKey, key.GetSecret(fCompressed), vchPubKey.GetHash(), vchCryptedSecret))
CKeyingMaterial vchSecret(key.begin(), key.end());
if (!EncryptSecret(vMasterKey, vchSecret, pubkey.GetHash(), vchCryptedSecret))
return false;
if (!AddCryptedKey(key.GetPubKey(), vchCryptedSecret))
if (!AddCryptedKey(pubkey, vchCryptedSecret))
return false;
}
return true;
@@ -166,13 +151,12 @@ bool CCryptoKeyStore::GetKey(const CKeyID &address, CKey& keyOut) const
{
const CPubKey &vchPubKey = (*mi).second.first;
const std::vector<unsigned char> &vchCryptedSecret = (*mi).second.second;
CSecret vchSecret;
CKeyingMaterial vchSecret;
if (!DecryptSecret(vMasterKey, vchCryptedSecret, vchPubKey.GetHash(), vchSecret))
return false;
if (vchSecret.size() != 32)
return false;
keyOut.SetPubKey(vchPubKey);
keyOut.SetSecret(vchSecret);
keyOut.Set(vchSecret.begin(), vchSecret.end(), vchPubKey.IsCompressed());
return true;
}
}
@@ -206,13 +190,11 @@ bool CCryptoKeyStore::EncryptKeys(CKeyingMaterial& vMasterKeyIn)
fUseCrypto = true;
BOOST_FOREACH(KeyMap::value_type& mKey, mapKeys)
{
CKey key;
if (!key.SetSecret(mKey.second.first, mKey.second.second))
return false;
const CPubKey vchPubKey = key.GetPubKey();
const CKey &key = mKey.second;
CPubKey vchPubKey = key.GetPubKey();
CKeyingMaterial vchSecret(key.begin(), key.end());
std::vector<unsigned char> vchCryptedSecret;
bool fCompressed;
if (!EncryptSecret(vMasterKeyIn, key.GetSecret(fCompressed), vchPubKey.GetHash(), vchCryptedSecret))
if (!EncryptSecret(vMasterKeyIn, vchSecret, vchPubKey.GetHash(), vchCryptedSecret))
return false;
if (!AddCryptedKey(vchPubKey, vchCryptedSecret))
return false;

View File

@@ -1,7 +1,5 @@
// Copyright (c) 2009-2010 Satoshi Nakamoto
// Copyright (c) 2009-2012 The Bitcoin developers
// Copyright (c) 2011-2012 Litecoin Developers
// Copyright (c) 2013 CasinoCoin Developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef BITCOIN_KEYSTORE_H
@@ -23,7 +21,8 @@ public:
virtual ~CKeyStore() {}
// Add a key to the store.
virtual bool AddKey(const CKey& key) =0;
virtual bool AddKeyPubKey(const CKey &key, const CPubKey &pubkey) =0;
virtual bool AddKey(const CKey &key);
// Check whether a key corresponding to a given address is present in the store.
virtual bool HaveKey(const CKeyID &address) const =0;
@@ -35,18 +34,9 @@ public:
virtual bool AddCScript(const CScript& redeemScript) =0;
virtual bool HaveCScript(const CScriptID &hash) const =0;
virtual bool GetCScript(const CScriptID &hash, CScript& redeemScriptOut) const =0;
virtual bool GetSecret(const CKeyID &address, CSecret& vchSecret, bool &fCompressed) const
{
CKey key;
if (!GetKey(address, key))
return false;
vchSecret = key.GetSecret(fCompressed);
return true;
}
};
typedef std::map<CKeyID, std::pair<CSecret, bool> > KeyMap;
typedef std::map<CKeyID, CKey> KeyMap;
typedef std::map<CScriptID, CScript > ScriptMap;
/** Basic key store, that keeps keys in an address->secret map */
@@ -57,7 +47,7 @@ protected:
ScriptMap mapScripts;
public:
bool AddKey(const CKey& key);
bool AddKeyPubKey(const CKey& key, const CPubKey &pubkey);
bool HaveKey(const CKeyID &address) const
{
bool result;
@@ -87,8 +77,7 @@ public:
KeyMap::const_iterator mi = mapKeys.find(address);
if (mi != mapKeys.end())
{
keyOut.Reset();
keyOut.SetSecret((*mi).second.first, (*mi).second.second);
keyOut = mi->second;
return true;
}
}
@@ -148,7 +137,7 @@ public:
bool Lock();
virtual bool AddCryptedKey(const CPubKey &vchPubKey, const std::vector<unsigned char> &vchCryptedSecret);
bool AddKey(const CKey& key);
bool AddKeyPubKey(const CKey& key, const CPubKey &pubkey);
bool HaveKey(const CKeyID &address) const
{
{

81
src/leveldb.cpp Normal file
View File

@@ -0,0 +1,81 @@
// Copyright (c) 2012 The Bitcoin developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#include "leveldb.h"
#include "util.h"
#include <leveldb/env.h>
#include <leveldb/cache.h>
#include <leveldb/filter_policy.h>
#include <memenv/memenv.h>
#include <boost/filesystem.hpp>
void HandleError(const leveldb::Status &status) throw(leveldb_error) {
if (status.ok())
return;
if (status.IsCorruption())
throw leveldb_error("Database corrupted");
if (status.IsIOError())
throw leveldb_error("Database I/O error");
if (status.IsNotFound())
throw leveldb_error("Database entry missing");
throw leveldb_error("Unknown database error");
}
static leveldb::Options GetOptions(size_t nCacheSize) {
leveldb::Options options;
options.block_cache = leveldb::NewLRUCache(nCacheSize / 2);
options.write_buffer_size = nCacheSize / 4; // up to two write buffers may be held in memory simultaneously
options.filter_policy = leveldb::NewBloomFilterPolicy(10);
options.compression = leveldb::kNoCompression;
options.max_open_files = 64;
return options;
}
CLevelDB::CLevelDB(const boost::filesystem::path &path, size_t nCacheSize, bool fMemory, bool fWipe) {
penv = NULL;
readoptions.verify_checksums = true;
iteroptions.verify_checksums = true;
iteroptions.fill_cache = false;
syncoptions.sync = true;
options = GetOptions(nCacheSize);
options.create_if_missing = true;
if (fMemory) {
penv = leveldb::NewMemEnv(leveldb::Env::Default());
options.env = penv;
} else {
if (fWipe) {
printf("Wiping LevelDB in %s\n", path.string().c_str());
leveldb::DestroyDB(path.string(), options);
}
boost::filesystem::create_directory(path);
printf("Opening LevelDB in %s\n", path.string().c_str());
}
leveldb::Status status = leveldb::DB::Open(options, path.string(), &pdb);
if (!status.ok())
throw std::runtime_error(strprintf("CLevelDB(): error opening database environment %s", status.ToString().c_str()));
printf("Opened LevelDB successfully\n");
}
CLevelDB::~CLevelDB() {
delete pdb;
pdb = NULL;
delete options.filter_policy;
options.filter_policy = NULL;
delete options.block_cache;
options.block_cache = NULL;
delete penv;
options.env = NULL;
}
bool CLevelDB::WriteBatch(CLevelDBBatch &batch, bool fSync) throw(leveldb_error) {
leveldb::Status status = pdb->Write(fSync ? syncoptions : writeoptions, &batch.batch);
if (!status.ok()) {
printf("LevelDB write failure: %s\n", status.ToString().c_str());
HandleError(status);
return false;
}
return true;
}

153
src/leveldb.h Normal file
View File

@@ -0,0 +1,153 @@
// Copyright (c) 2012 The Bitcoin developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef BITCOIN_LEVELDB_H
#define BITCOIN_LEVELDB_H
#include "serialize.h"
#include <leveldb/db.h>
#include <leveldb/write_batch.h>
#include <boost/filesystem/path.hpp>
class leveldb_error : public std::runtime_error
{
public:
leveldb_error(const std::string &msg) : std::runtime_error(msg) {}
};
void HandleError(const leveldb::Status &status) throw(leveldb_error);
// Batch of changes queued to be written to a CLevelDB
class CLevelDBBatch
{
friend class CLevelDB;
private:
leveldb::WriteBatch batch;
public:
template<typename K, typename V> void Write(const K& key, const V& value) {
CDataStream ssKey(SER_DISK, CLIENT_VERSION);
ssKey.reserve(ssKey.GetSerializeSize(key));
ssKey << key;
leveldb::Slice slKey(&ssKey[0], ssKey.size());
CDataStream ssValue(SER_DISK, CLIENT_VERSION);
ssValue.reserve(ssValue.GetSerializeSize(value));
ssValue << value;
leveldb::Slice slValue(&ssValue[0], ssValue.size());
batch.Put(slKey, slValue);
}
template<typename K> void Erase(const K& key) {
CDataStream ssKey(SER_DISK, CLIENT_VERSION);
ssKey.reserve(ssKey.GetSerializeSize(key));
ssKey << key;
leveldb::Slice slKey(&ssKey[0], ssKey.size());
batch.Delete(slKey);
}
};
class CLevelDB
{
private:
// custom environment this database is using (may be NULL in case of default environment)
leveldb::Env *penv;
// database options used
leveldb::Options options;
// options used when reading from the database
leveldb::ReadOptions readoptions;
// options used when iterating over values of the database
leveldb::ReadOptions iteroptions;
// options used when writing to the database
leveldb::WriteOptions writeoptions;
// options used when sync writing to the database
leveldb::WriteOptions syncoptions;
// the database itself
leveldb::DB *pdb;
public:
CLevelDB(const boost::filesystem::path &path, size_t nCacheSize, bool fMemory = false, bool fWipe = false);
~CLevelDB();
template<typename K, typename V> bool Read(const K& key, V& value) throw(leveldb_error) {
CDataStream ssKey(SER_DISK, CLIENT_VERSION);
ssKey.reserve(ssKey.GetSerializeSize(key));
ssKey << key;
leveldb::Slice slKey(&ssKey[0], ssKey.size());
std::string strValue;
leveldb::Status status = pdb->Get(readoptions, slKey, &strValue);
if (!status.ok()) {
if (status.IsNotFound())
return false;
printf("LevelDB read failure: %s\n", status.ToString().c_str());
HandleError(status);
}
try {
CDataStream ssValue(strValue.data(), strValue.data() + strValue.size(), SER_DISK, CLIENT_VERSION);
ssValue >> value;
} catch(std::exception &e) {
return false;
}
return true;
}
template<typename K, typename V> bool Write(const K& key, const V& value, bool fSync = false) throw(leveldb_error) {
CLevelDBBatch batch;
batch.Write(key, value);
return WriteBatch(batch, fSync);
}
template<typename K> bool Exists(const K& key) throw(leveldb_error) {
CDataStream ssKey(SER_DISK, CLIENT_VERSION);
ssKey.reserve(ssKey.GetSerializeSize(key));
ssKey << key;
leveldb::Slice slKey(&ssKey[0], ssKey.size());
std::string strValue;
leveldb::Status status = pdb->Get(readoptions, slKey, &strValue);
if (!status.ok()) {
if (status.IsNotFound())
return false;
printf("LevelDB read failure: %s\n", status.ToString().c_str());
HandleError(status);
}
return true;
}
template<typename K> bool Erase(const K& key, bool fSync = false) throw(leveldb_error) {
CLevelDBBatch batch;
batch.Erase(key);
return WriteBatch(batch, fSync);
}
bool WriteBatch(CLevelDBBatch &batch, bool fSync = false) throw(leveldb_error);
// not available for LevelDB; provide for compatibility with BDB
bool Flush() {
return true;
}
bool Sync() throw(leveldb_error) {
CLevelDBBatch batch;
return WriteBatch(batch, true);
}
// not exactly clean encapsulation, but it's easiest for now
leveldb::Iterator *NewIterator() {
return pdb->NewIterator(iteroptions);
}
};
#endif // BITCOIN_LEVELDB_H

13
src/leveldb/.gitignore vendored Normal file
View File

@@ -0,0 +1,13 @@
build_config.mk
*.a
*.o
*.dylib*
*.so
*.so.*
*_test
db_bench
leveldbutil
Release
Debug
Benchmark
vs2010.*

11
src/leveldb/AUTHORS Normal file
View File

@@ -0,0 +1,11 @@
# Names should be added to this file like so:
# Name or Organization <email address>
Google Inc.
# Initial version authors:
Jeffrey Dean <jeff@google.com>
Sanjay Ghemawat <sanjay@google.com>
# Partial list of contributors:
Kevin Regan <kevin.d.regan@gmail.com>

27
src/leveldb/LICENSE Normal file
View File

@@ -0,0 +1,27 @@
Copyright (c) 2011 The LevelDB Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

17
src/leveldb/NEWS Normal file
View File

@@ -0,0 +1,17 @@
Release 1.2 2011-05-16
----------------------
Fixes for larger databases (tested up to one billion 100-byte entries,
i.e., ~100GB).
(1) Place hard limit on number of level-0 files. This fixes errors
of the form "too many open files".
(2) Fixed memtable management. Before the fix, a heavy write burst
could cause unbounded memory usage.
A fix for a logging bug where the reader would incorrectly complain
about corruption.
Allow public access to WriteBatch contents so that users can easily
wrap a DB.

51
src/leveldb/README Normal file
View File

@@ -0,0 +1,51 @@
leveldb: A key-value store
Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com)
The code under this directory implements a system for maintaining a
persistent key/value store.
See doc/index.html for more explanation.
See doc/impl.html for a brief overview of the implementation.
The public interface is in include/*.h. Callers should not include or
rely on the details of any other header files in this package. Those
internal APIs may be changed without warning.
Guide to header files:
include/db.h
Main interface to the DB: Start here
include/options.h
Control over the behavior of an entire database, and also
control over the behavior of individual reads and writes.
include/comparator.h
Abstraction for user-specified comparison function. If you want
just bytewise comparison of keys, you can use the default comparator,
but clients can write their own comparator implementations if they
want custom ordering (e.g. to handle different character
encodings, etc.)
include/iterator.h
Interface for iterating over data. You can get an iterator
from a DB object.
include/write_batch.h
Interface for atomically applying multiple updates to a database.
include/slice.h
A simple module for maintaining a pointer and a length into some
other byte array.
include/status.h
Status is returned from many of the public interfaces and is used
to report success and various kinds of errors.
include/env.h
Abstraction of the OS environment. A posix implementation of
this interface is in util/env_posix.cc
include/table.h
include/table_builder.h
Lower-level modules that most clients probably won't use directly

14
src/leveldb/TODO Normal file
View File

@@ -0,0 +1,14 @@
ss
- Stats
db
- Maybe implement DB::BulkDeleteForRange(start_key, end_key)
that would blow away files whose ranges are entirely contained
within [start_key..end_key]? For Chrome, deletion of obsolete
object stores, etc. can be done in the background anyway, so
probably not that important.
- There have been requests for MultiGet.
After a range is completely deleted, what gets rid of the
corresponding files if we do no future changes to that range. Make
the conditions for triggering compactions fire in more situations?

39
src/leveldb/WINDOWS.md Normal file
View File

@@ -0,0 +1,39 @@
# Building LevelDB On Windows
## Prereqs
Install the [Windows Software Development Kit version 7.1](http://www.microsoft.com/downloads/dlx/en-us/listdetailsview.aspx?FamilyID=6b6c21d2-2006-4afa-9702-529fa782d63b).
Download and extract the [Snappy source distribution](http://snappy.googlecode.com/files/snappy-1.0.5.tar.gz)
1. Open the "Windows SDK 7.1 Command Prompt" :
Start Menu -> "Microsoft Windows SDK v7.1" > "Windows SDK 7.1 Command Prompt"
2. Change the directory to the leveldb project
## Building the Static lib
* 32 bit Version
setenv /x86
msbuild.exe /p:Configuration=Release /p:Platform=Win32 /p:Snappy=..\snappy-1.0.5
* 64 bit Version
setenv /x64
msbuild.exe /p:Configuration=Release /p:Platform=x64 /p:Snappy=..\snappy-1.0.5
## Building and Running the Benchmark app
* 32 bit Version
setenv /x86
msbuild.exe /p:Configuration=Benchmark /p:Platform=Win32 /p:Snappy=..\snappy-1.0.5
Benchmark\leveldb.exe
* 64 bit Version
setenv /x64
msbuild.exe /p:Configuration=Benchmark /p:Platform=x64 /p:Snappy=..\snappy-1.0.5
x64\Benchmark\leveldb.exe

View File

@@ -0,0 +1,214 @@
#!/bin/sh
#
# Detects OS we're compiling on and outputs a file specified by the first
# argument, which in turn gets read while processing Makefile.
#
# The output will set the following variables:
# CC C Compiler path
# CXX C++ Compiler path
# PLATFORM_LDFLAGS Linker flags
# PLATFORM_LIBS Libraries flags
# PLATFORM_SHARED_EXT Extension for shared libraries
# PLATFORM_SHARED_LDFLAGS Flags for building shared library
# This flag is embedded just before the name
# of the shared library without intervening spaces
# PLATFORM_SHARED_CFLAGS Flags for compiling objects for shared library
# PLATFORM_CCFLAGS C compiler flags
# PLATFORM_CXXFLAGS C++ compiler flags. Will contain:
# PLATFORM_SHARED_VERSIONED Set to 'true' if platform supports versioned
# shared libraries, empty otherwise.
#
# The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following:
#
# -DLEVELDB_CSTDATOMIC_PRESENT if <cstdatomic> is present
# -DLEVELDB_PLATFORM_POSIX for Posix-based platforms
# -DSNAPPY if the Snappy library is present
#
OUTPUT=$1
PREFIX=$2
if test -z "$OUTPUT" || test -z "$PREFIX"; then
echo "usage: $0 <output-filename> <directory_prefix>" >&2
exit 1
fi
# Delete existing output, if it exists
rm -f $OUTPUT
touch $OUTPUT
if test -z "$CC"; then
CC=cc
fi
if test -z "$CXX"; then
CXX=g++
fi
if test -z "$TMPDIR"; then
TMPDIR=/tmp
fi
# Detect OS
if test -z "$TARGET_OS"; then
TARGET_OS=`uname -s`
fi
COMMON_FLAGS=
CROSS_COMPILE=
PLATFORM_CCFLAGS=
PLATFORM_CXXFLAGS=
PLATFORM_LDFLAGS=
PLATFORM_LIBS=
PLATFORM_SHARED_EXT="so"
PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl,"
PLATFORM_SHARED_CFLAGS="-fPIC"
PLATFORM_SHARED_VERSIONED=true
MEMCMP_FLAG=
if [ "$CXX" = "g++" ]; then
# Use libc's memcmp instead of GCC's memcmp. This results in ~40%
# performance improvement on readrandom under gcc 4.4.3 on Linux/x86.
MEMCMP_FLAG="-fno-builtin-memcmp"
fi
case "$TARGET_OS" in
Darwin)
PLATFORM=OS_MACOSX
COMMON_FLAGS="$MEMCMP_FLAG -DOS_MACOSX"
PLATFORM_SHARED_EXT=dylib
[ -z "$INSTALL_PATH" ] && INSTALL_PATH=`pwd`
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name $INSTALL_PATH/"
PORT_FILE=port/port_posix.cc
;;
Linux)
PLATFORM=OS_LINUX
COMMON_FLAGS="$MEMCMP_FLAG -pthread -DOS_LINUX"
PLATFORM_LDFLAGS="-pthread"
PORT_FILE=port/port_posix.cc
;;
SunOS)
PLATFORM=OS_SOLARIS
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_SOLARIS"
PLATFORM_LIBS="-lpthread -lrt"
PORT_FILE=port/port_posix.cc
;;
FreeBSD)
PLATFORM=OS_FREEBSD
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_FREEBSD"
PLATFORM_LIBS="-lpthread"
PORT_FILE=port/port_posix.cc
;;
GNU/kFreeBSD)
PLATFORM=OS_KFREEBSD
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_KFREEBSD"
PLATFORM_LIBS="-lpthread"
PORT_FILE=port/port_posix.cc
;;
NetBSD)
PLATFORM=OS_NETBSD
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_NETBSD"
PLATFORM_LIBS="-lpthread -lgcc_s"
PORT_FILE=port/port_posix.cc
;;
OpenBSD)
PLATFORM=OS_OPENBSD
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_OPENBSD"
PLATFORM_LDFLAGS="-pthread"
PORT_FILE=port/port_posix.cc
;;
DragonFly)
PLATFORM=OS_DRAGONFLYBSD
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_DRAGONFLYBSD"
PLATFORM_LIBS="-lpthread"
PORT_FILE=port/port_posix.cc
;;
OS_ANDROID_CROSSCOMPILE)
PLATFORM=OS_ANDROID
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
PLATFORM_LDFLAGS="" # All pthread features are in the Android C library
PORT_FILE=port/port_posix.cc
CROSS_COMPILE=true
;;
HP-UX)
PLATFORM=OS_HPUX
COMMON_FLAGS="$MEMCMP_FLAG -D_REENTRANT -DOS_HPUX"
PLATFORM_LDFLAGS="-pthread"
PORT_FILE=port/port_posix.cc
# man ld: +h internal_name
PLATFORM_SHARED_LDFLAGS="-shared -Wl,+h -Wl,"
;;
OS_WINDOWS_CROSSCOMPILE | NATIVE_WINDOWS)
PLATFORM=OS_WINDOWS
COMMON_FLAGS="-fno-builtin-memcmp -D_REENTRANT -DOS_WINDOWS -DLEVELDB_PLATFORM_WINDOWS -DWINVER=0x0500 -D__USE_MINGW_ANSI_STDIO=1"
PLATFORM_SOURCES="util/env_win.cc"
PLATFORM_LIBS="-lshlwapi"
PORT_FILE=port/port_win.cc
CROSS_COMPILE=true
;;
*)
echo "Unknown platform!" >&2
exit 1
esac
# We want to make a list of all cc files within util, db, table, and helpers
# except for the test and benchmark files. By default, find will output a list
# of all files matching either rule, so we need to append -print to make the
# prune take effect.
DIRS="$PREFIX/db $PREFIX/util $PREFIX/table"
set -f # temporarily disable globbing so that our patterns aren't expanded
PRUNE_TEST="-name *test*.cc -prune"
PRUNE_BENCH="-name *_bench.cc -prune"
PRUNE_TOOL="-name leveldb_main.cc -prune"
PORTABLE_FILES=`find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o $PRUNE_TOOL -o -name '*.cc' -print | sort | sed "s,^$PREFIX/,," | tr "\n" " "`
set +f # re-enable globbing
# The sources consist of the portable files, plus the platform-specific port
# file.
echo "SOURCES=$PORTABLE_FILES $PORT_FILE" >> $OUTPUT
echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT
if [ "$CROSS_COMPILE" = "true" ]; then
# Cross-compiling; do not try any compilation tests.
true
else
CXXOUTPUT="${TMPDIR}/leveldb_build_detect_platform-cxx.$$"
# If -std=c++0x works, use <cstdatomic>. Otherwise use port_posix.h.
$CXX $CXXFLAGS -std=c++0x -x c++ - -o $CXXOUTPUT 2>/dev/null <<EOF
#include <cstdatomic>
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX -DLEVELDB_CSTDATOMIC_PRESENT"
PLATFORM_CXXFLAGS="-std=c++0x"
else
COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX"
fi
# Test whether tcmalloc is available
$CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT -ltcmalloc 2>/dev/null <<EOF
int main() {}
EOF
if [ "$?" = 0 ]; then
PLATFORM_LIBS="$PLATFORM_LIBS -ltcmalloc"
fi
rm -f $CXXOUTPUT 2>/dev/null
fi
PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"
echo "CC=$CC" >> $OUTPUT
echo "CXX=$CXX" >> $OUTPUT
echo "PLATFORM=$PLATFORM" >> $OUTPUT
echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT
echo "PLATFORM_LIBS=$PLATFORM_LIBS" >> $OUTPUT
echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT
echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT
echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT
echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT
echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT
echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> $OUTPUT

View File

@@ -0,0 +1,118 @@
// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "leveldb/db.h"
#include "db/db_impl.h"
#include "leveldb/cache.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace leveldb {
class AutoCompactTest {
public:
std::string dbname_;
Cache* tiny_cache_;
Options options_;
DB* db_;
AutoCompactTest() {
dbname_ = test::TmpDir() + "/autocompact_test";
tiny_cache_ = NewLRUCache(100);
options_.block_cache = tiny_cache_;
DestroyDB(dbname_, options_);
options_.create_if_missing = true;
options_.compression = kNoCompression;
ASSERT_OK(DB::Open(options_, dbname_, &db_));
}
~AutoCompactTest() {
delete db_;
DestroyDB(dbname_, Options());
delete tiny_cache_;
}
std::string Key(int i) {
char buf[100];
snprintf(buf, sizeof(buf), "key%06d", i);
return std::string(buf);
}
uint64_t Size(const Slice& start, const Slice& limit) {
Range r(start, limit);
uint64_t size;
db_->GetApproximateSizes(&r, 1, &size);
return size;
}
void DoReads(int n);
};
static const int kValueSize = 200 * 1024;
static const int kTotalSize = 100 * 1024 * 1024;
static const int kCount = kTotalSize / kValueSize;
// Read through the first n keys repeatedly and check that they get
// compacted (verified by checking the size of the key space).
void AutoCompactTest::DoReads(int n) {
std::string value(kValueSize, 'x');
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
// Fill database
for (int i = 0; i < kCount; i++) {
ASSERT_OK(db_->Put(WriteOptions(), Key(i), value));
}
ASSERT_OK(dbi->TEST_CompactMemTable());
// Delete everything
for (int i = 0; i < kCount; i++) {
ASSERT_OK(db_->Delete(WriteOptions(), Key(i)));
}
ASSERT_OK(dbi->TEST_CompactMemTable());
// Get initial measurement of the space we will be reading.
const int64_t initial_size = Size(Key(0), Key(n));
const int64_t initial_other_size = Size(Key(n), Key(kCount));
// Read until size drops significantly.
std::string limit_key = Key(n);
for (int read = 0; true; read++) {
ASSERT_LT(read, 100) << "Taking too long to compact";
Iterator* iter = db_->NewIterator(ReadOptions());
for (iter->SeekToFirst();
iter->Valid() && iter->key().ToString() < limit_key;
iter->Next()) {
// Drop data
}
delete iter;
// Wait a little bit to allow any triggered compactions to complete.
Env::Default()->SleepForMicroseconds(1000000);
uint64_t size = Size(Key(0), Key(n));
fprintf(stderr, "iter %3d => %7.3f MB [other %7.3f MB]\n",
read+1, size/1048576.0, Size(Key(n), Key(kCount))/1048576.0);
if (size <= initial_size/10) {
break;
}
}
// Verify that the size of the key space not touched by the reads
// is pretty much unchanged.
const int64_t final_other_size = Size(Key(n), Key(kCount));
ASSERT_LE(final_other_size, initial_other_size + 1048576);
ASSERT_GE(final_other_size, initial_other_size/5 - 1048576);
}
TEST(AutoCompactTest, ReadAll) {
DoReads(kCount);
}
TEST(AutoCompactTest, ReadHalf) {
DoReads(kCount/2);
}
} // namespace leveldb
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

88
src/leveldb/db/builder.cc Normal file
View File

@@ -0,0 +1,88 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/builder.h"
#include "db/filename.h"
#include "db/dbformat.h"
#include "db/table_cache.h"
#include "db/version_edit.h"
#include "leveldb/db.h"
#include "leveldb/env.h"
#include "leveldb/iterator.h"
namespace leveldb {
Status BuildTable(const std::string& dbname,
Env* env,
const Options& options,
TableCache* table_cache,
Iterator* iter,
FileMetaData* meta) {
Status s;
meta->file_size = 0;
iter->SeekToFirst();
std::string fname = TableFileName(dbname, meta->number);
if (iter->Valid()) {
WritableFile* file;
s = env->NewWritableFile(fname, &file);
if (!s.ok()) {
return s;
}
TableBuilder* builder = new TableBuilder(options, file);
meta->smallest.DecodeFrom(iter->key());
for (; iter->Valid(); iter->Next()) {
Slice key = iter->key();
meta->largest.DecodeFrom(key);
builder->Add(key, iter->value());
}
// Finish and check for builder errors
if (s.ok()) {
s = builder->Finish();
if (s.ok()) {
meta->file_size = builder->FileSize();
assert(meta->file_size > 0);
}
} else {
builder->Abandon();
}
delete builder;
// Finish and check for file errors
if (s.ok()) {
s = file->Sync();
}
if (s.ok()) {
s = file->Close();
}
delete file;
file = NULL;
if (s.ok()) {
// Verify that the table is usable
Iterator* it = table_cache->NewIterator(ReadOptions(),
meta->number,
meta->file_size);
s = it->status();
delete it;
}
}
// Check for input iterator errors
if (!iter->status().ok()) {
s = iter->status();
}
if (s.ok() && meta->file_size > 0) {
// Keep it
} else {
env->DeleteFile(fname);
}
return s;
}
} // namespace leveldb

34
src/leveldb/db/builder.h Normal file
View File

@@ -0,0 +1,34 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_BUILDER_H_
#define STORAGE_LEVELDB_DB_BUILDER_H_
#include "leveldb/status.h"
namespace leveldb {
struct Options;
struct FileMetaData;
class Env;
class Iterator;
class TableCache;
class VersionEdit;
// Build a Table file from the contents of *iter. The generated file
// will be named according to meta->number. On success, the rest of
// *meta will be filled with metadata about the generated table.
// If no data is present in *iter, meta->file_size will be set to
// zero, and no Table file will be produced.
extern Status BuildTable(const std::string& dbname,
Env* env,
const Options& options,
TableCache* table_cache,
Iterator* iter,
FileMetaData* meta);
} // namespace leveldb
#endif // STORAGE_LEVELDB_DB_BUILDER_H_

595
src/leveldb/db/c.cc Normal file
View File

@@ -0,0 +1,595 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "leveldb/c.h"
#include <stdlib.h>
#include <unistd.h>
#include "leveldb/cache.h"
#include "leveldb/comparator.h"
#include "leveldb/db.h"
#include "leveldb/env.h"
#include "leveldb/filter_policy.h"
#include "leveldb/iterator.h"
#include "leveldb/options.h"
#include "leveldb/status.h"
#include "leveldb/write_batch.h"
using leveldb::Cache;
using leveldb::Comparator;
using leveldb::CompressionType;
using leveldb::DB;
using leveldb::Env;
using leveldb::FileLock;
using leveldb::FilterPolicy;
using leveldb::Iterator;
using leveldb::kMajorVersion;
using leveldb::kMinorVersion;
using leveldb::Logger;
using leveldb::NewBloomFilterPolicy;
using leveldb::NewLRUCache;
using leveldb::Options;
using leveldb::RandomAccessFile;
using leveldb::Range;
using leveldb::ReadOptions;
using leveldb::SequentialFile;
using leveldb::Slice;
using leveldb::Snapshot;
using leveldb::Status;
using leveldb::WritableFile;
using leveldb::WriteBatch;
using leveldb::WriteOptions;
extern "C" {
struct leveldb_t { DB* rep; };
struct leveldb_iterator_t { Iterator* rep; };
struct leveldb_writebatch_t { WriteBatch rep; };
struct leveldb_snapshot_t { const Snapshot* rep; };
struct leveldb_readoptions_t { ReadOptions rep; };
struct leveldb_writeoptions_t { WriteOptions rep; };
struct leveldb_options_t { Options rep; };
struct leveldb_cache_t { Cache* rep; };
struct leveldb_seqfile_t { SequentialFile* rep; };
struct leveldb_randomfile_t { RandomAccessFile* rep; };
struct leveldb_writablefile_t { WritableFile* rep; };
struct leveldb_logger_t { Logger* rep; };
struct leveldb_filelock_t { FileLock* rep; };
struct leveldb_comparator_t : public Comparator {
void* state_;
void (*destructor_)(void*);
int (*compare_)(
void*,
const char* a, size_t alen,
const char* b, size_t blen);
const char* (*name_)(void*);
virtual ~leveldb_comparator_t() {
(*destructor_)(state_);
}
virtual int Compare(const Slice& a, const Slice& b) const {
return (*compare_)(state_, a.data(), a.size(), b.data(), b.size());
}
virtual const char* Name() const {
return (*name_)(state_);
}
// No-ops since the C binding does not support key shortening methods.
virtual void FindShortestSeparator(std::string*, const Slice&) const { }
virtual void FindShortSuccessor(std::string* key) const { }
};
struct leveldb_filterpolicy_t : public FilterPolicy {
void* state_;
void (*destructor_)(void*);
const char* (*name_)(void*);
char* (*create_)(
void*,
const char* const* key_array, const size_t* key_length_array,
int num_keys,
size_t* filter_length);
unsigned char (*key_match_)(
void*,
const char* key, size_t length,
const char* filter, size_t filter_length);
virtual ~leveldb_filterpolicy_t() {
(*destructor_)(state_);
}
virtual const char* Name() const {
return (*name_)(state_);
}
virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
std::vector<const char*> key_pointers(n);
std::vector<size_t> key_sizes(n);
for (int i = 0; i < n; i++) {
key_pointers[i] = keys[i].data();
key_sizes[i] = keys[i].size();
}
size_t len;
char* filter = (*create_)(state_, &key_pointers[0], &key_sizes[0], n, &len);
dst->append(filter, len);
free(filter);
}
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
return (*key_match_)(state_, key.data(), key.size(),
filter.data(), filter.size());
}
};
struct leveldb_env_t {
Env* rep;
bool is_default;
};
static bool SaveError(char** errptr, const Status& s) {
assert(errptr != NULL);
if (s.ok()) {
return false;
} else if (*errptr == NULL) {
*errptr = strdup(s.ToString().c_str());
} else {
// TODO(sanjay): Merge with existing error?
free(*errptr);
*errptr = strdup(s.ToString().c_str());
}
return true;
}
static char* CopyString(const std::string& str) {
char* result = reinterpret_cast<char*>(malloc(sizeof(char) * str.size()));
memcpy(result, str.data(), sizeof(char) * str.size());
return result;
}
leveldb_t* leveldb_open(
const leveldb_options_t* options,
const char* name,
char** errptr) {
DB* db;
if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) {
return NULL;
}
leveldb_t* result = new leveldb_t;
result->rep = db;
return result;
}
void leveldb_close(leveldb_t* db) {
delete db->rep;
delete db;
}
void leveldb_put(
leveldb_t* db,
const leveldb_writeoptions_t* options,
const char* key, size_t keylen,
const char* val, size_t vallen,
char** errptr) {
SaveError(errptr,
db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
}
void leveldb_delete(
leveldb_t* db,
const leveldb_writeoptions_t* options,
const char* key, size_t keylen,
char** errptr) {
SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen)));
}
void leveldb_write(
leveldb_t* db,
const leveldb_writeoptions_t* options,
leveldb_writebatch_t* batch,
char** errptr) {
SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
}
char* leveldb_get(
leveldb_t* db,
const leveldb_readoptions_t* options,
const char* key, size_t keylen,
size_t* vallen,
char** errptr) {
char* result = NULL;
std::string tmp;
Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
if (s.ok()) {
*vallen = tmp.size();
result = CopyString(tmp);
} else {
*vallen = 0;
if (!s.IsNotFound()) {
SaveError(errptr, s);
}
}
return result;
}
leveldb_iterator_t* leveldb_create_iterator(
leveldb_t* db,
const leveldb_readoptions_t* options) {
leveldb_iterator_t* result = new leveldb_iterator_t;
result->rep = db->rep->NewIterator(options->rep);
return result;
}
const leveldb_snapshot_t* leveldb_create_snapshot(
leveldb_t* db) {
leveldb_snapshot_t* result = new leveldb_snapshot_t;
result->rep = db->rep->GetSnapshot();
return result;
}
void leveldb_release_snapshot(
leveldb_t* db,
const leveldb_snapshot_t* snapshot) {
db->rep->ReleaseSnapshot(snapshot->rep);
delete snapshot;
}
char* leveldb_property_value(
leveldb_t* db,
const char* propname) {
std::string tmp;
if (db->rep->GetProperty(Slice(propname), &tmp)) {
// We use strdup() since we expect human readable output.
return strdup(tmp.c_str());
} else {
return NULL;
}
}
void leveldb_approximate_sizes(
leveldb_t* db,
int num_ranges,
const char* const* range_start_key, const size_t* range_start_key_len,
const char* const* range_limit_key, const size_t* range_limit_key_len,
uint64_t* sizes) {
Range* ranges = new Range[num_ranges];
for (int i = 0; i < num_ranges; i++) {
ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
}
db->rep->GetApproximateSizes(ranges, num_ranges, sizes);
delete[] ranges;
}
void leveldb_compact_range(
leveldb_t* db,
const char* start_key, size_t start_key_len,
const char* limit_key, size_t limit_key_len) {
Slice a, b;
db->rep->CompactRange(
// Pass NULL Slice if corresponding "const char*" is NULL
(start_key ? (a = Slice(start_key, start_key_len), &a) : NULL),
(limit_key ? (b = Slice(limit_key, limit_key_len), &b) : NULL));
}
void leveldb_destroy_db(
const leveldb_options_t* options,
const char* name,
char** errptr) {
SaveError(errptr, DestroyDB(name, options->rep));
}
void leveldb_repair_db(
const leveldb_options_t* options,
const char* name,
char** errptr) {
SaveError(errptr, RepairDB(name, options->rep));
}
void leveldb_iter_destroy(leveldb_iterator_t* iter) {
delete iter->rep;
delete iter;
}
unsigned char leveldb_iter_valid(const leveldb_iterator_t* iter) {
return iter->rep->Valid();
}
void leveldb_iter_seek_to_first(leveldb_iterator_t* iter) {
iter->rep->SeekToFirst();
}
void leveldb_iter_seek_to_last(leveldb_iterator_t* iter) {
iter->rep->SeekToLast();
}
void leveldb_iter_seek(leveldb_iterator_t* iter, const char* k, size_t klen) {
iter->rep->Seek(Slice(k, klen));
}
void leveldb_iter_next(leveldb_iterator_t* iter) {
iter->rep->Next();
}
void leveldb_iter_prev(leveldb_iterator_t* iter) {
iter->rep->Prev();
}
const char* leveldb_iter_key(const leveldb_iterator_t* iter, size_t* klen) {
Slice s = iter->rep->key();
*klen = s.size();
return s.data();
}
const char* leveldb_iter_value(const leveldb_iterator_t* iter, size_t* vlen) {
Slice s = iter->rep->value();
*vlen = s.size();
return s.data();
}
void leveldb_iter_get_error(const leveldb_iterator_t* iter, char** errptr) {
SaveError(errptr, iter->rep->status());
}
leveldb_writebatch_t* leveldb_writebatch_create() {
return new leveldb_writebatch_t;
}
void leveldb_writebatch_destroy(leveldb_writebatch_t* b) {
delete b;
}
void leveldb_writebatch_clear(leveldb_writebatch_t* b) {
b->rep.Clear();
}
void leveldb_writebatch_put(
leveldb_writebatch_t* b,
const char* key, size_t klen,
const char* val, size_t vlen) {
b->rep.Put(Slice(key, klen), Slice(val, vlen));
}
void leveldb_writebatch_delete(
leveldb_writebatch_t* b,
const char* key, size_t klen) {
b->rep.Delete(Slice(key, klen));
}
void leveldb_writebatch_iterate(
leveldb_writebatch_t* b,
void* state,
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
void (*deleted)(void*, const char* k, size_t klen)) {
class H : public WriteBatch::Handler {
public:
void* state_;
void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
void (*deleted_)(void*, const char* k, size_t klen);
virtual void Put(const Slice& key, const Slice& value) {
(*put_)(state_, key.data(), key.size(), value.data(), value.size());
}
virtual void Delete(const Slice& key) {
(*deleted_)(state_, key.data(), key.size());
}
};
H handler;
handler.state_ = state;
handler.put_ = put;
handler.deleted_ = deleted;
b->rep.Iterate(&handler);
}
leveldb_options_t* leveldb_options_create() {
return new leveldb_options_t;
}
void leveldb_options_destroy(leveldb_options_t* options) {
delete options;
}
void leveldb_options_set_comparator(
leveldb_options_t* opt,
leveldb_comparator_t* cmp) {
opt->rep.comparator = cmp;
}
void leveldb_options_set_filter_policy(
leveldb_options_t* opt,
leveldb_filterpolicy_t* policy) {
opt->rep.filter_policy = policy;
}
void leveldb_options_set_create_if_missing(
leveldb_options_t* opt, unsigned char v) {
opt->rep.create_if_missing = v;
}
void leveldb_options_set_error_if_exists(
leveldb_options_t* opt, unsigned char v) {
opt->rep.error_if_exists = v;
}
void leveldb_options_set_paranoid_checks(
leveldb_options_t* opt, unsigned char v) {
opt->rep.paranoid_checks = v;
}
void leveldb_options_set_env(leveldb_options_t* opt, leveldb_env_t* env) {
opt->rep.env = (env ? env->rep : NULL);
}
void leveldb_options_set_info_log(leveldb_options_t* opt, leveldb_logger_t* l) {
opt->rep.info_log = (l ? l->rep : NULL);
}
void leveldb_options_set_write_buffer_size(leveldb_options_t* opt, size_t s) {
opt->rep.write_buffer_size = s;
}
void leveldb_options_set_max_open_files(leveldb_options_t* opt, int n) {
opt->rep.max_open_files = n;
}
void leveldb_options_set_cache(leveldb_options_t* opt, leveldb_cache_t* c) {
opt->rep.block_cache = c->rep;
}
void leveldb_options_set_block_size(leveldb_options_t* opt, size_t s) {
opt->rep.block_size = s;
}
void leveldb_options_set_block_restart_interval(leveldb_options_t* opt, int n) {
opt->rep.block_restart_interval = n;
}
void leveldb_options_set_compression(leveldb_options_t* opt, int t) {
opt->rep.compression = static_cast<CompressionType>(t);
}
leveldb_comparator_t* leveldb_comparator_create(
void* state,
void (*destructor)(void*),
int (*compare)(
void*,
const char* a, size_t alen,
const char* b, size_t blen),
const char* (*name)(void*)) {
leveldb_comparator_t* result = new leveldb_comparator_t;
result->state_ = state;
result->destructor_ = destructor;
result->compare_ = compare;
result->name_ = name;
return result;
}
void leveldb_comparator_destroy(leveldb_comparator_t* cmp) {
delete cmp;
}
leveldb_filterpolicy_t* leveldb_filterpolicy_create(
void* state,
void (*destructor)(void*),
char* (*create_filter)(
void*,
const char* const* key_array, const size_t* key_length_array,
int num_keys,
size_t* filter_length),
unsigned char (*key_may_match)(
void*,
const char* key, size_t length,
const char* filter, size_t filter_length),
const char* (*name)(void*)) {
leveldb_filterpolicy_t* result = new leveldb_filterpolicy_t;
result->state_ = state;
result->destructor_ = destructor;
result->create_ = create_filter;
result->key_match_ = key_may_match;
result->name_ = name;
return result;
}
void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t* filter) {
delete filter;
}
leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(int bits_per_key) {
// Make a leveldb_filterpolicy_t, but override all of its methods so
// they delegate to a NewBloomFilterPolicy() instead of user
// supplied C functions.
struct Wrapper : public leveldb_filterpolicy_t {
const FilterPolicy* rep_;
~Wrapper() { delete rep_; }
const char* Name() const { return rep_->Name(); }
void CreateFilter(const Slice* keys, int n, std::string* dst) const {
return rep_->CreateFilter(keys, n, dst);
}
bool KeyMayMatch(const Slice& key, const Slice& filter) const {
return rep_->KeyMayMatch(key, filter);
}
static void DoNothing(void*) { }
};
Wrapper* wrapper = new Wrapper;
wrapper->rep_ = NewBloomFilterPolicy(bits_per_key);
wrapper->state_ = NULL;
wrapper->destructor_ = &Wrapper::DoNothing;
return wrapper;
}
leveldb_readoptions_t* leveldb_readoptions_create() {
return new leveldb_readoptions_t;
}
void leveldb_readoptions_destroy(leveldb_readoptions_t* opt) {
delete opt;
}
void leveldb_readoptions_set_verify_checksums(
leveldb_readoptions_t* opt,
unsigned char v) {
opt->rep.verify_checksums = v;
}
void leveldb_readoptions_set_fill_cache(
leveldb_readoptions_t* opt, unsigned char v) {
opt->rep.fill_cache = v;
}
void leveldb_readoptions_set_snapshot(
leveldb_readoptions_t* opt,
const leveldb_snapshot_t* snap) {
opt->rep.snapshot = (snap ? snap->rep : NULL);
}
leveldb_writeoptions_t* leveldb_writeoptions_create() {
return new leveldb_writeoptions_t;
}
void leveldb_writeoptions_destroy(leveldb_writeoptions_t* opt) {
delete opt;
}
void leveldb_writeoptions_set_sync(
leveldb_writeoptions_t* opt, unsigned char v) {
opt->rep.sync = v;
}
leveldb_cache_t* leveldb_cache_create_lru(size_t capacity) {
leveldb_cache_t* c = new leveldb_cache_t;
c->rep = NewLRUCache(capacity);
return c;
}
void leveldb_cache_destroy(leveldb_cache_t* cache) {
delete cache->rep;
delete cache;
}
leveldb_env_t* leveldb_create_default_env() {
leveldb_env_t* result = new leveldb_env_t;
result->rep = Env::Default();
result->is_default = true;
return result;
}
void leveldb_env_destroy(leveldb_env_t* env) {
if (!env->is_default) delete env->rep;
delete env;
}
void leveldb_free(void* ptr) {
free(ptr);
}
int leveldb_major_version() {
return kMajorVersion;
}
int leveldb_minor_version() {
return kMinorVersion;
}
} // end extern "C"

390
src/leveldb/db/c_test.c Normal file
View File

@@ -0,0 +1,390 @@
/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
Use of this source code is governed by a BSD-style license that can be
found in the LICENSE file. See the AUTHORS file for names of contributors. */
#include "leveldb/c.h"
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
const char* phase = "";
static char dbname[200];
static void StartPhase(const char* name) {
fprintf(stderr, "=== Test %s\n", name);
phase = name;
}
static const char* GetTempDir(void) {
const char* ret = getenv("TEST_TMPDIR");
if (ret == NULL || ret[0] == '\0')
ret = "/tmp";
return ret;
}
#define CheckNoError(err) \
if ((err) != NULL) { \
fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \
abort(); \
}
#define CheckCondition(cond) \
if (!(cond)) { \
fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \
abort(); \
}
static void CheckEqual(const char* expected, const char* v, size_t n) {
if (expected == NULL && v == NULL) {
// ok
} else if (expected != NULL && v != NULL && n == strlen(expected) &&
memcmp(expected, v, n) == 0) {
// ok
return;
} else {
fprintf(stderr, "%s: expected '%s', got '%s'\n",
phase,
(expected ? expected : "(null)"),
(v ? v : "(null"));
abort();
}
}
static void Free(char** ptr) {
if (*ptr) {
free(*ptr);
*ptr = NULL;
}
}
static void CheckGet(
leveldb_t* db,
const leveldb_readoptions_t* options,
const char* key,
const char* expected) {
char* err = NULL;
size_t val_len;
char* val;
val = leveldb_get(db, options, key, strlen(key), &val_len, &err);
CheckNoError(err);
CheckEqual(expected, val, val_len);
Free(&val);
}
static void CheckIter(leveldb_iterator_t* iter,
const char* key, const char* val) {
size_t len;
const char* str;
str = leveldb_iter_key(iter, &len);
CheckEqual(key, str, len);
str = leveldb_iter_value(iter, &len);
CheckEqual(val, str, len);
}
// Callback from leveldb_writebatch_iterate()
static void CheckPut(void* ptr,
const char* k, size_t klen,
const char* v, size_t vlen) {
int* state = (int*) ptr;
CheckCondition(*state < 2);
switch (*state) {
case 0:
CheckEqual("bar", k, klen);
CheckEqual("b", v, vlen);
break;
case 1:
CheckEqual("box", k, klen);
CheckEqual("c", v, vlen);
break;
}
(*state)++;
}
// Callback from leveldb_writebatch_iterate()
static void CheckDel(void* ptr, const char* k, size_t klen) {
int* state = (int*) ptr;
CheckCondition(*state == 2);
CheckEqual("bar", k, klen);
(*state)++;
}
static void CmpDestroy(void* arg) { }
static int CmpCompare(void* arg, const char* a, size_t alen,
const char* b, size_t blen) {
int n = (alen < blen) ? alen : blen;
int r = memcmp(a, b, n);
if (r == 0) {
if (alen < blen) r = -1;
else if (alen > blen) r = +1;
}
return r;
}
static const char* CmpName(void* arg) {
return "foo";
}
// Custom filter policy
static unsigned char fake_filter_result = 1;
static void FilterDestroy(void* arg) { }
static const char* FilterName(void* arg) {
return "TestFilter";
}
static char* FilterCreate(
void* arg,
const char* const* key_array, const size_t* key_length_array,
int num_keys,
size_t* filter_length) {
*filter_length = 4;
char* result = malloc(4);
memcpy(result, "fake", 4);
return result;
}
unsigned char FilterKeyMatch(
void* arg,
const char* key, size_t length,
const char* filter, size_t filter_length) {
CheckCondition(filter_length == 4);
CheckCondition(memcmp(filter, "fake", 4) == 0);
return fake_filter_result;
}
int main(int argc, char** argv) {
leveldb_t* db;
leveldb_comparator_t* cmp;
leveldb_cache_t* cache;
leveldb_env_t* env;
leveldb_options_t* options;
leveldb_readoptions_t* roptions;
leveldb_writeoptions_t* woptions;
char* err = NULL;
int run = -1;
CheckCondition(leveldb_major_version() >= 1);
CheckCondition(leveldb_minor_version() >= 1);
snprintf(dbname, sizeof(dbname),
"%s/leveldb_c_test-%d",
GetTempDir(),
((int) geteuid()));
StartPhase("create_objects");
cmp = leveldb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName);
env = leveldb_create_default_env();
cache = leveldb_cache_create_lru(100000);
options = leveldb_options_create();
leveldb_options_set_comparator(options, cmp);
leveldb_options_set_error_if_exists(options, 1);
leveldb_options_set_cache(options, cache);
leveldb_options_set_env(options, env);
leveldb_options_set_info_log(options, NULL);
leveldb_options_set_write_buffer_size(options, 100000);
leveldb_options_set_paranoid_checks(options, 1);
leveldb_options_set_max_open_files(options, 10);
leveldb_options_set_block_size(options, 1024);
leveldb_options_set_block_restart_interval(options, 8);
leveldb_options_set_compression(options, leveldb_no_compression);
roptions = leveldb_readoptions_create();
leveldb_readoptions_set_verify_checksums(roptions, 1);
leveldb_readoptions_set_fill_cache(roptions, 0);
woptions = leveldb_writeoptions_create();
leveldb_writeoptions_set_sync(woptions, 1);
StartPhase("destroy");
leveldb_destroy_db(options, dbname, &err);
Free(&err);
StartPhase("open_error");
db = leveldb_open(options, dbname, &err);
CheckCondition(err != NULL);
Free(&err);
StartPhase("leveldb_free");
db = leveldb_open(options, dbname, &err);
CheckCondition(err != NULL);
leveldb_free(err);
err = NULL;
StartPhase("open");
leveldb_options_set_create_if_missing(options, 1);
db = leveldb_open(options, dbname, &err);
CheckNoError(err);
CheckGet(db, roptions, "foo", NULL);
StartPhase("put");
leveldb_put(db, woptions, "foo", 3, "hello", 5, &err);
CheckNoError(err);
CheckGet(db, roptions, "foo", "hello");
StartPhase("compactall");
leveldb_compact_range(db, NULL, 0, NULL, 0);
CheckGet(db, roptions, "foo", "hello");
StartPhase("compactrange");
leveldb_compact_range(db, "a", 1, "z", 1);
CheckGet(db, roptions, "foo", "hello");
StartPhase("writebatch");
{
leveldb_writebatch_t* wb = leveldb_writebatch_create();
leveldb_writebatch_put(wb, "foo", 3, "a", 1);
leveldb_writebatch_clear(wb);
leveldb_writebatch_put(wb, "bar", 3, "b", 1);
leveldb_writebatch_put(wb, "box", 3, "c", 1);
leveldb_writebatch_delete(wb, "bar", 3);
leveldb_write(db, woptions, wb, &err);
CheckNoError(err);
CheckGet(db, roptions, "foo", "hello");
CheckGet(db, roptions, "bar", NULL);
CheckGet(db, roptions, "box", "c");
int pos = 0;
leveldb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
CheckCondition(pos == 3);
leveldb_writebatch_destroy(wb);
}
StartPhase("iter");
{
leveldb_iterator_t* iter = leveldb_create_iterator(db, roptions);
CheckCondition(!leveldb_iter_valid(iter));
leveldb_iter_seek_to_first(iter);
CheckCondition(leveldb_iter_valid(iter));
CheckIter(iter, "box", "c");
leveldb_iter_next(iter);
CheckIter(iter, "foo", "hello");
leveldb_iter_prev(iter);
CheckIter(iter, "box", "c");
leveldb_iter_prev(iter);
CheckCondition(!leveldb_iter_valid(iter));
leveldb_iter_seek_to_last(iter);
CheckIter(iter, "foo", "hello");
leveldb_iter_seek(iter, "b", 1);
CheckIter(iter, "box", "c");
leveldb_iter_get_error(iter, &err);
CheckNoError(err);
leveldb_iter_destroy(iter);
}
StartPhase("approximate_sizes");
{
int i;
int n = 20000;
char keybuf[100];
char valbuf[100];
uint64_t sizes[2];
const char* start[2] = { "a", "k00000000000000010000" };
size_t start_len[2] = { 1, 21 };
const char* limit[2] = { "k00000000000000010000", "z" };
size_t limit_len[2] = { 21, 1 };
leveldb_writeoptions_set_sync(woptions, 0);
for (i = 0; i < n; i++) {
snprintf(keybuf, sizeof(keybuf), "k%020d", i);
snprintf(valbuf, sizeof(valbuf), "v%020d", i);
leveldb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf),
&err);
CheckNoError(err);
}
leveldb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes);
CheckCondition(sizes[0] > 0);
CheckCondition(sizes[1] > 0);
}
StartPhase("property");
{
char* prop = leveldb_property_value(db, "nosuchprop");
CheckCondition(prop == NULL);
prop = leveldb_property_value(db, "leveldb.stats");
CheckCondition(prop != NULL);
Free(&prop);
}
StartPhase("snapshot");
{
const leveldb_snapshot_t* snap;
snap = leveldb_create_snapshot(db);
leveldb_delete(db, woptions, "foo", 3, &err);
CheckNoError(err);
leveldb_readoptions_set_snapshot(roptions, snap);
CheckGet(db, roptions, "foo", "hello");
leveldb_readoptions_set_snapshot(roptions, NULL);
CheckGet(db, roptions, "foo", NULL);
leveldb_release_snapshot(db, snap);
}
StartPhase("repair");
{
leveldb_close(db);
leveldb_options_set_create_if_missing(options, 0);
leveldb_options_set_error_if_exists(options, 0);
leveldb_repair_db(options, dbname, &err);
CheckNoError(err);
db = leveldb_open(options, dbname, &err);
CheckNoError(err);
CheckGet(db, roptions, "foo", NULL);
CheckGet(db, roptions, "bar", NULL);
CheckGet(db, roptions, "box", "c");
leveldb_options_set_create_if_missing(options, 1);
leveldb_options_set_error_if_exists(options, 1);
}
StartPhase("filter");
for (run = 0; run < 2; run++) {
// First run uses custom filter, second run uses bloom filter
CheckNoError(err);
leveldb_filterpolicy_t* policy;
if (run == 0) {
policy = leveldb_filterpolicy_create(
NULL, FilterDestroy, FilterCreate, FilterKeyMatch, FilterName);
} else {
policy = leveldb_filterpolicy_create_bloom(10);
}
// Create new database
leveldb_close(db);
leveldb_destroy_db(options, dbname, &err);
leveldb_options_set_filter_policy(options, policy);
db = leveldb_open(options, dbname, &err);
CheckNoError(err);
leveldb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
CheckNoError(err);
leveldb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
CheckNoError(err);
leveldb_compact_range(db, NULL, 0, NULL, 0);
fake_filter_result = 1;
CheckGet(db, roptions, "foo", "foovalue");
CheckGet(db, roptions, "bar", "barvalue");
if (phase == 0) {
// Must not find value when custom filter returns false
fake_filter_result = 0;
CheckGet(db, roptions, "foo", NULL);
CheckGet(db, roptions, "bar", NULL);
fake_filter_result = 1;
CheckGet(db, roptions, "foo", "foovalue");
CheckGet(db, roptions, "bar", "barvalue");
}
leveldb_options_set_filter_policy(options, NULL);
leveldb_filterpolicy_destroy(policy);
}
StartPhase("cleanup");
leveldb_close(db);
leveldb_options_destroy(options);
leveldb_readoptions_destroy(roptions);
leveldb_writeoptions_destroy(woptions);
leveldb_cache_destroy(cache);
leveldb_comparator_destroy(cmp);
leveldb_env_destroy(env);
fprintf(stderr, "PASS\n");
return 0;
}

View File

@@ -0,0 +1,352 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "leveldb/db.h"
#include <errno.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "leveldb/cache.h"
#include "leveldb/env.h"
#include "leveldb/table.h"
#include "leveldb/write_batch.h"
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/log_format.h"
#include "db/version_set.h"
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace leveldb {
static const int kValueSize = 1000;
class CorruptionTest {
public:
test::ErrorEnv env_;
std::string dbname_;
Cache* tiny_cache_;
Options options_;
DB* db_;
CorruptionTest() {
tiny_cache_ = NewLRUCache(100);
options_.env = &env_;
options_.block_cache = tiny_cache_;
dbname_ = test::TmpDir() + "/db_test";
DestroyDB(dbname_, options_);
db_ = NULL;
options_.create_if_missing = true;
Reopen();
options_.create_if_missing = false;
}
~CorruptionTest() {
delete db_;
DestroyDB(dbname_, Options());
delete tiny_cache_;
}
Status TryReopen() {
delete db_;
db_ = NULL;
return DB::Open(options_, dbname_, &db_);
}
void Reopen() {
ASSERT_OK(TryReopen());
}
void RepairDB() {
delete db_;
db_ = NULL;
ASSERT_OK(::leveldb::RepairDB(dbname_, options_));
}
void Build(int n) {
std::string key_space, value_space;
WriteBatch batch;
for (int i = 0; i < n; i++) {
//if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
Slice key = Key(i, &key_space);
batch.Clear();
batch.Put(key, Value(i, &value_space));
ASSERT_OK(db_->Write(WriteOptions(), &batch));
}
}
void Check(int min_expected, int max_expected) {
int next_expected = 0;
int missed = 0;
int bad_keys = 0;
int bad_values = 0;
int correct = 0;
std::string value_space;
Iterator* iter = db_->NewIterator(ReadOptions());
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
uint64_t key;
Slice in(iter->key());
if (in == "" || in == "~") {
// Ignore boundary keys.
continue;
}
if (!ConsumeDecimalNumber(&in, &key) ||
!in.empty() ||
key < next_expected) {
bad_keys++;
continue;
}
missed += (key - next_expected);
next_expected = key + 1;
if (iter->value() != Value(key, &value_space)) {
bad_values++;
} else {
correct++;
}
}
delete iter;
fprintf(stderr,
"expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n",
min_expected, max_expected, correct, bad_keys, bad_values, missed);
ASSERT_LE(min_expected, correct);
ASSERT_GE(max_expected, correct);
}
void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
// Pick file to corrupt
std::vector<std::string> filenames;
ASSERT_OK(env_.GetChildren(dbname_, &filenames));
uint64_t number;
FileType type;
std::string fname;
int picked_number = -1;
for (int i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type) &&
type == filetype &&
int(number) > picked_number) { // Pick latest file
fname = dbname_ + "/" + filenames[i];
picked_number = number;
}
}
ASSERT_TRUE(!fname.empty()) << filetype;
struct stat sbuf;
if (stat(fname.c_str(), &sbuf) != 0) {
const char* msg = strerror(errno);
ASSERT_TRUE(false) << fname << ": " << msg;
}
if (offset < 0) {
// Relative to end of file; make it absolute
if (-offset > sbuf.st_size) {
offset = 0;
} else {
offset = sbuf.st_size + offset;
}
}
if (offset > sbuf.st_size) {
offset = sbuf.st_size;
}
if (offset + bytes_to_corrupt > sbuf.st_size) {
bytes_to_corrupt = sbuf.st_size - offset;
}
// Do it
std::string contents;
Status s = ReadFileToString(Env::Default(), fname, &contents);
ASSERT_TRUE(s.ok()) << s.ToString();
for (int i = 0; i < bytes_to_corrupt; i++) {
contents[i + offset] ^= 0x80;
}
s = WriteStringToFile(Env::Default(), contents, fname);
ASSERT_TRUE(s.ok()) << s.ToString();
}
int Property(const std::string& name) {
std::string property;
int result;
if (db_->GetProperty(name, &property) &&
sscanf(property.c_str(), "%d", &result) == 1) {
return result;
} else {
return -1;
}
}
// Return the ith key
Slice Key(int i, std::string* storage) {
char buf[100];
snprintf(buf, sizeof(buf), "%016d", i);
storage->assign(buf, strlen(buf));
return Slice(*storage);
}
// Return the value to associate with the specified key
Slice Value(int k, std::string* storage) {
Random r(k);
return test::RandomString(&r, kValueSize, storage);
}
};
TEST(CorruptionTest, Recovery) {
Build(100);
Check(100, 100);
Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record
Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block
Reopen();
// The 64 records in the first two log blocks are completely lost.
Check(36, 36);
}
TEST(CorruptionTest, RecoverWriteError) {
env_.writable_file_error_ = true;
Status s = TryReopen();
ASSERT_TRUE(!s.ok());
}
TEST(CorruptionTest, NewFileErrorDuringWrite) {
// Do enough writing to force minor compaction
env_.writable_file_error_ = true;
const int num = 3 + (Options().write_buffer_size / kValueSize);
std::string value_storage;
Status s;
for (int i = 0; s.ok() && i < num; i++) {
WriteBatch batch;
batch.Put("a", Value(100, &value_storage));
s = db_->Write(WriteOptions(), &batch);
}
ASSERT_TRUE(!s.ok());
ASSERT_GE(env_.num_writable_file_errors_, 1);
env_.writable_file_error_ = false;
Reopen();
}
TEST(CorruptionTest, TableFile) {
Build(100);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
dbi->TEST_CompactRange(0, NULL, NULL);
dbi->TEST_CompactRange(1, NULL, NULL);
Corrupt(kTableFile, 100, 1);
Check(90, 99);
}
TEST(CorruptionTest, TableFileIndexData) {
Build(10000); // Enough to build multiple Tables
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
Corrupt(kTableFile, -2000, 500);
Reopen();
Check(5000, 9999);
}
TEST(CorruptionTest, MissingDescriptor) {
Build(1000);
RepairDB();
Reopen();
Check(1000, 1000);
}
TEST(CorruptionTest, SequenceNumberRecovery) {
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
RepairDB();
Reopen();
std::string v;
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ("v5", v);
// Write something. If sequence number was not recovered properly,
// it will be hidden by an earlier write.
ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ("v6", v);
Reopen();
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ("v6", v);
}
TEST(CorruptionTest, CorruptedDescriptor) {
ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
dbi->TEST_CompactRange(0, NULL, NULL);
Corrupt(kDescriptorFile, 0, 1000);
Status s = TryReopen();
ASSERT_TRUE(!s.ok());
RepairDB();
Reopen();
std::string v;
ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
ASSERT_EQ("hello", v);
}
TEST(CorruptionTest, CompactionInputError) {
Build(10);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
const int last = config::kMaxMemCompactLevel;
ASSERT_EQ(1, Property("leveldb.num-files-at-level" + NumberToString(last)));
Corrupt(kTableFile, 100, 1);
Check(5, 9);
// Force compactions by writing lots of values
Build(10000);
Check(10000, 10000);
}
TEST(CorruptionTest, CompactionInputErrorParanoid) {
options_.paranoid_checks = true;
options_.write_buffer_size = 512 << 10;
Reopen();
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
// Make multiple inputs so we need to compact.
for (int i = 0; i < 2; i++) {
Build(10);
dbi->TEST_CompactMemTable();
Corrupt(kTableFile, 100, 1);
env_.SleepForMicroseconds(100000);
}
dbi->CompactRange(NULL, NULL);
// Write must fail because of corrupted table
std::string tmp1, tmp2;
Status s = db_->Put(WriteOptions(), Key(5, &tmp1), Value(5, &tmp2));
ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
}
TEST(CorruptionTest, UnrelatedKeys) {
Build(10);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_CompactMemTable();
Corrupt(kTableFile, 100, 1);
std::string tmp1, tmp2;
ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
std::string v;
ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
dbi->TEST_CompactMemTable();
ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
}
} // namespace leveldb
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

979
src/leveldb/db/db_bench.cc Normal file
View File

@@ -0,0 +1,979 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <sys/types.h>
#include <stdio.h>
#include <stdlib.h>
#include "db/db_impl.h"
#include "db/version_set.h"
#include "leveldb/cache.h"
#include "leveldb/db.h"
#include "leveldb/env.h"
#include "leveldb/write_batch.h"
#include "port/port.h"
#include "util/crc32c.h"
#include "util/histogram.h"
#include "util/mutexlock.h"
#include "util/random.h"
#include "util/testutil.h"
// Comma-separated list of operations to run in the specified order
// Actual benchmarks:
// fillseq -- write N values in sequential key order in async mode
// fillrandom -- write N values in random key order in async mode
// overwrite -- overwrite N values in random key order in async mode
// fillsync -- write N/100 values in random key order in sync mode
// fill100K -- write N/1000 100K values in random order in async mode
// deleteseq -- delete N keys in sequential order
// deleterandom -- delete N keys in random order
// readseq -- read N times sequentially
// readreverse -- read N times in reverse order
// readrandom -- read N times in random order
// readmissing -- read N missing keys in random order
// readhot -- read N times in random order from 1% section of DB
// seekrandom -- N random seeks
// crc32c -- repeated crc32c of 4K of data
// acquireload -- load N*1000 times
// Meta operations:
// compact -- Compact the entire DB
// stats -- Print DB stats
// sstables -- Print sstable info
// heapprofile -- Dump a heap profile (if supported by this port)
static const char* FLAGS_benchmarks =
"fillseq,"
"fillsync,"
"fillrandom,"
"overwrite,"
"readrandom,"
"readrandom," // Extra run to allow previous compactions to quiesce
"readseq,"
"readreverse,"
"compact,"
"readrandom,"
"readseq,"
"readreverse,"
"fill100K,"
"crc32c,"
"snappycomp,"
"snappyuncomp,"
"acquireload,"
;
// Number of key/values to place in database
static int FLAGS_num = 1000000;
// Number of read operations to do. If negative, do FLAGS_num reads.
static int FLAGS_reads = -1;
// Number of concurrent threads to run.
static int FLAGS_threads = 1;
// Size of each value
static int FLAGS_value_size = 100;
// Arrange to generate values that shrink to this fraction of
// their original size after compression
static double FLAGS_compression_ratio = 0.5;
// Print histogram of operation timings
static bool FLAGS_histogram = false;
// Number of bytes to buffer in memtable before compacting
// (initialized to default value by "main")
static int FLAGS_write_buffer_size = 0;
// Number of bytes to use as a cache of uncompressed data.
// Negative means use default settings.
static int FLAGS_cache_size = -1;
// Maximum number of files to keep open at the same time (use default if == 0)
static int FLAGS_open_files = 0;
// Bloom filter bits per key.
// Negative means use default settings.
static int FLAGS_bloom_bits = -1;
// If true, do not destroy the existing database. If you set this
// flag and also specify a benchmark that wants a fresh database, that
// benchmark will fail.
static bool FLAGS_use_existing_db = false;
// Use the db with the following name.
static const char* FLAGS_db = NULL;
namespace leveldb {
namespace {
// Helper for quickly generating random data.
class RandomGenerator {
private:
std::string data_;
int pos_;
public:
RandomGenerator() {
// We use a limited amount of data over and over again and ensure
// that it is larger than the compression window (32KB), and also
// large enough to serve all typical value sizes we want to write.
Random rnd(301);
std::string piece;
while (data_.size() < 1048576) {
// Add a short fragment that is as compressible as specified
// by FLAGS_compression_ratio.
test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
data_.append(piece);
}
pos_ = 0;
}
Slice Generate(int len) {
if (pos_ + len > data_.size()) {
pos_ = 0;
assert(len < data_.size());
}
pos_ += len;
return Slice(data_.data() + pos_ - len, len);
}
};
static Slice TrimSpace(Slice s) {
int start = 0;
while (start < s.size() && isspace(s[start])) {
start++;
}
int limit = s.size();
while (limit > start && isspace(s[limit-1])) {
limit--;
}
return Slice(s.data() + start, limit - start);
}
static void AppendWithSpace(std::string* str, Slice msg) {
if (msg.empty()) return;
if (!str->empty()) {
str->push_back(' ');
}
str->append(msg.data(), msg.size());
}
class Stats {
private:
double start_;
double finish_;
double seconds_;
int done_;
int next_report_;
int64_t bytes_;
double last_op_finish_;
Histogram hist_;
std::string message_;
public:
Stats() { Start(); }
void Start() {
next_report_ = 100;
last_op_finish_ = start_;
hist_.Clear();
done_ = 0;
bytes_ = 0;
seconds_ = 0;
start_ = Env::Default()->NowMicros();
finish_ = start_;
message_.clear();
}
void Merge(const Stats& other) {
hist_.Merge(other.hist_);
done_ += other.done_;
bytes_ += other.bytes_;
seconds_ += other.seconds_;
if (other.start_ < start_) start_ = other.start_;
if (other.finish_ > finish_) finish_ = other.finish_;
// Just keep the messages from one thread
if (message_.empty()) message_ = other.message_;
}
void Stop() {
finish_ = Env::Default()->NowMicros();
seconds_ = (finish_ - start_) * 1e-6;
}
void AddMessage(Slice msg) {
AppendWithSpace(&message_, msg);
}
void FinishedSingleOp() {
if (FLAGS_histogram) {
double now = Env::Default()->NowMicros();
double micros = now - last_op_finish_;
hist_.Add(micros);
if (micros > 20000) {
fprintf(stderr, "long op: %.1f micros%30s\r", micros, "");
fflush(stderr);
}
last_op_finish_ = now;
}
done_++;
if (done_ >= next_report_) {
if (next_report_ < 1000) next_report_ += 100;
else if (next_report_ < 5000) next_report_ += 500;
else if (next_report_ < 10000) next_report_ += 1000;
else if (next_report_ < 50000) next_report_ += 5000;
else if (next_report_ < 100000) next_report_ += 10000;
else if (next_report_ < 500000) next_report_ += 50000;
else next_report_ += 100000;
fprintf(stderr, "... finished %d ops%30s\r", done_, "");
fflush(stderr);
}
}
void AddBytes(int64_t n) {
bytes_ += n;
}
void Report(const Slice& name) {
// Pretend at least one op was done in case we are running a benchmark
// that does not call FinishedSingleOp().
if (done_ < 1) done_ = 1;
std::string extra;
if (bytes_ > 0) {
// Rate is computed on actual elapsed time, not the sum of per-thread
// elapsed times.
double elapsed = (finish_ - start_) * 1e-6;
char rate[100];
snprintf(rate, sizeof(rate), "%6.1f MB/s",
(bytes_ / 1048576.0) / elapsed);
extra = rate;
}
AppendWithSpace(&extra, message_);
fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n",
name.ToString().c_str(),
seconds_ * 1e6 / done_,
(extra.empty() ? "" : " "),
extra.c_str());
if (FLAGS_histogram) {
fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
}
fflush(stdout);
}
};
// State shared by all concurrent executions of the same benchmark.
struct SharedState {
port::Mutex mu;
port::CondVar cv;
int total;
// Each thread goes through the following states:
// (1) initializing
// (2) waiting for others to be initialized
// (3) running
// (4) done
int num_initialized;
int num_done;
bool start;
SharedState() : cv(&mu) { }
};
// Per-thread state for concurrent executions of the same benchmark.
struct ThreadState {
int tid; // 0..n-1 when running in n threads
Random rand; // Has different seeds for different threads
Stats stats;
SharedState* shared;
ThreadState(int index)
: tid(index),
rand(1000 + index) {
}
};
} // namespace
class Benchmark {
private:
Cache* cache_;
const FilterPolicy* filter_policy_;
DB* db_;
int num_;
int value_size_;
int entries_per_batch_;
WriteOptions write_options_;
int reads_;
int heap_counter_;
void PrintHeader() {
const int kKeySize = 16;
PrintEnvironment();
fprintf(stdout, "Keys: %d bytes each\n", kKeySize);
fprintf(stdout, "Values: %d bytes each (%d bytes after compression)\n",
FLAGS_value_size,
static_cast<int>(FLAGS_value_size * FLAGS_compression_ratio + 0.5));
fprintf(stdout, "Entries: %d\n", num_);
fprintf(stdout, "RawSize: %.1f MB (estimated)\n",
((static_cast<int64_t>(kKeySize + FLAGS_value_size) * num_)
/ 1048576.0));
fprintf(stdout, "FileSize: %.1f MB (estimated)\n",
(((kKeySize + FLAGS_value_size * FLAGS_compression_ratio) * num_)
/ 1048576.0));
PrintWarnings();
fprintf(stdout, "------------------------------------------------\n");
}
void PrintWarnings() {
#if defined(__GNUC__) && !defined(__OPTIMIZE__)
fprintf(stdout,
"WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"
);
#endif
#ifndef NDEBUG
fprintf(stdout,
"WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
#endif
// See if snappy is working by attempting to compress a compressible string
const char text[] = "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy";
std::string compressed;
if (!port::Snappy_Compress(text, sizeof(text), &compressed)) {
fprintf(stdout, "WARNING: Snappy compression is not enabled\n");
} else if (compressed.size() >= sizeof(text)) {
fprintf(stdout, "WARNING: Snappy compression is not effective\n");
}
}
void PrintEnvironment() {
fprintf(stderr, "LevelDB: version %d.%d\n",
kMajorVersion, kMinorVersion);
#if defined(__linux)
time_t now = time(NULL);
fprintf(stderr, "Date: %s", ctime(&now)); // ctime() adds newline
FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
if (cpuinfo != NULL) {
char line[1000];
int num_cpus = 0;
std::string cpu_type;
std::string cache_size;
while (fgets(line, sizeof(line), cpuinfo) != NULL) {
const char* sep = strchr(line, ':');
if (sep == NULL) {
continue;
}
Slice key = TrimSpace(Slice(line, sep - 1 - line));
Slice val = TrimSpace(Slice(sep + 1));
if (key == "model name") {
++num_cpus;
cpu_type = val.ToString();
} else if (key == "cache size") {
cache_size = val.ToString();
}
}
fclose(cpuinfo);
fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str());
fprintf(stderr, "CPUCache: %s\n", cache_size.c_str());
}
#endif
}
public:
Benchmark()
: cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL),
filter_policy_(FLAGS_bloom_bits >= 0
? NewBloomFilterPolicy(FLAGS_bloom_bits)
: NULL),
db_(NULL),
num_(FLAGS_num),
value_size_(FLAGS_value_size),
entries_per_batch_(1),
reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
heap_counter_(0) {
std::vector<std::string> files;
Env::Default()->GetChildren(FLAGS_db, &files);
for (int i = 0; i < files.size(); i++) {
if (Slice(files[i]).starts_with("heap-")) {
Env::Default()->DeleteFile(std::string(FLAGS_db) + "/" + files[i]);
}
}
if (!FLAGS_use_existing_db) {
DestroyDB(FLAGS_db, Options());
}
}
~Benchmark() {
delete db_;
delete cache_;
delete filter_policy_;
}
void Run() {
PrintHeader();
Open();
const char* benchmarks = FLAGS_benchmarks;
while (benchmarks != NULL) {
const char* sep = strchr(benchmarks, ',');
Slice name;
if (sep == NULL) {
name = benchmarks;
benchmarks = NULL;
} else {
name = Slice(benchmarks, sep - benchmarks);
benchmarks = sep + 1;
}
// Reset parameters that may be overriddden bwlow
num_ = FLAGS_num;
reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
value_size_ = FLAGS_value_size;
entries_per_batch_ = 1;
write_options_ = WriteOptions();
void (Benchmark::*method)(ThreadState*) = NULL;
bool fresh_db = false;
int num_threads = FLAGS_threads;
if (name == Slice("fillseq")) {
fresh_db = true;
method = &Benchmark::WriteSeq;
} else if (name == Slice("fillbatch")) {
fresh_db = true;
entries_per_batch_ = 1000;
method = &Benchmark::WriteSeq;
} else if (name == Slice("fillrandom")) {
fresh_db = true;
method = &Benchmark::WriteRandom;
} else if (name == Slice("overwrite")) {
fresh_db = false;
method = &Benchmark::WriteRandom;
} else if (name == Slice("fillsync")) {
fresh_db = true;
num_ /= 1000;
write_options_.sync = true;
method = &Benchmark::WriteRandom;
} else if (name == Slice("fill100K")) {
fresh_db = true;
num_ /= 1000;
value_size_ = 100 * 1000;
method = &Benchmark::WriteRandom;
} else if (name == Slice("readseq")) {
method = &Benchmark::ReadSequential;
} else if (name == Slice("readreverse")) {
method = &Benchmark::ReadReverse;
} else if (name == Slice("readrandom")) {
method = &Benchmark::ReadRandom;
} else if (name == Slice("readmissing")) {
method = &Benchmark::ReadMissing;
} else if (name == Slice("seekrandom")) {
method = &Benchmark::SeekRandom;
} else if (name == Slice("readhot")) {
method = &Benchmark::ReadHot;
} else if (name == Slice("readrandomsmall")) {
reads_ /= 1000;
method = &Benchmark::ReadRandom;
} else if (name == Slice("deleteseq")) {
method = &Benchmark::DeleteSeq;
} else if (name == Slice("deleterandom")) {
method = &Benchmark::DeleteRandom;
} else if (name == Slice("readwhilewriting")) {
num_threads++; // Add extra thread for writing
method = &Benchmark::ReadWhileWriting;
} else if (name == Slice("compact")) {
method = &Benchmark::Compact;
} else if (name == Slice("crc32c")) {
method = &Benchmark::Crc32c;
} else if (name == Slice("acquireload")) {
method = &Benchmark::AcquireLoad;
} else if (name == Slice("snappycomp")) {
method = &Benchmark::SnappyCompress;
} else if (name == Slice("snappyuncomp")) {
method = &Benchmark::SnappyUncompress;
} else if (name == Slice("heapprofile")) {
HeapProfile();
} else if (name == Slice("stats")) {
PrintStats("leveldb.stats");
} else if (name == Slice("sstables")) {
PrintStats("leveldb.sstables");
} else {
if (name != Slice()) { // No error message for empty name
fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
}
}
if (fresh_db) {
if (FLAGS_use_existing_db) {
fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n",
name.ToString().c_str());
method = NULL;
} else {
delete db_;
db_ = NULL;
DestroyDB(FLAGS_db, Options());
Open();
}
}
if (method != NULL) {
RunBenchmark(num_threads, name, method);
}
}
}
private:
struct ThreadArg {
Benchmark* bm;
SharedState* shared;
ThreadState* thread;
void (Benchmark::*method)(ThreadState*);
};
static void ThreadBody(void* v) {
ThreadArg* arg = reinterpret_cast<ThreadArg*>(v);
SharedState* shared = arg->shared;
ThreadState* thread = arg->thread;
{
MutexLock l(&shared->mu);
shared->num_initialized++;
if (shared->num_initialized >= shared->total) {
shared->cv.SignalAll();
}
while (!shared->start) {
shared->cv.Wait();
}
}
thread->stats.Start();
(arg->bm->*(arg->method))(thread);
thread->stats.Stop();
{
MutexLock l(&shared->mu);
shared->num_done++;
if (shared->num_done >= shared->total) {
shared->cv.SignalAll();
}
}
}
void RunBenchmark(int n, Slice name,
void (Benchmark::*method)(ThreadState*)) {
SharedState shared;
shared.total = n;
shared.num_initialized = 0;
shared.num_done = 0;
shared.start = false;
ThreadArg* arg = new ThreadArg[n];
for (int i = 0; i < n; i++) {
arg[i].bm = this;
arg[i].method = method;
arg[i].shared = &shared;
arg[i].thread = new ThreadState(i);
arg[i].thread->shared = &shared;
Env::Default()->StartThread(ThreadBody, &arg[i]);
}
shared.mu.Lock();
while (shared.num_initialized < n) {
shared.cv.Wait();
}
shared.start = true;
shared.cv.SignalAll();
while (shared.num_done < n) {
shared.cv.Wait();
}
shared.mu.Unlock();
for (int i = 1; i < n; i++) {
arg[0].thread->stats.Merge(arg[i].thread->stats);
}
arg[0].thread->stats.Report(name);
for (int i = 0; i < n; i++) {
delete arg[i].thread;
}
delete[] arg;
}
void Crc32c(ThreadState* thread) {
// Checksum about 500MB of data total
const int size = 4096;
const char* label = "(4K per op)";
std::string data(size, 'x');
int64_t bytes = 0;
uint32_t crc = 0;
while (bytes < 500 * 1048576) {
crc = crc32c::Value(data.data(), size);
thread->stats.FinishedSingleOp();
bytes += size;
}
// Print so result is not dead
fprintf(stderr, "... crc=0x%x\r", static_cast<unsigned int>(crc));
thread->stats.AddBytes(bytes);
thread->stats.AddMessage(label);
}
void AcquireLoad(ThreadState* thread) {
int dummy;
port::AtomicPointer ap(&dummy);
int count = 0;
void *ptr = NULL;
thread->stats.AddMessage("(each op is 1000 loads)");
while (count < 100000) {
for (int i = 0; i < 1000; i++) {
ptr = ap.Acquire_Load();
}
count++;
thread->stats.FinishedSingleOp();
}
if (ptr == NULL) exit(1); // Disable unused variable warning.
}
void SnappyCompress(ThreadState* thread) {
RandomGenerator gen;
Slice input = gen.Generate(Options().block_size);
int64_t bytes = 0;
int64_t produced = 0;
bool ok = true;
std::string compressed;
while (ok && bytes < 1024 * 1048576) { // Compress 1G
ok = port::Snappy_Compress(input.data(), input.size(), &compressed);
produced += compressed.size();
bytes += input.size();
thread->stats.FinishedSingleOp();
}
if (!ok) {
thread->stats.AddMessage("(snappy failure)");
} else {
char buf[100];
snprintf(buf, sizeof(buf), "(output: %.1f%%)",
(produced * 100.0) / bytes);
thread->stats.AddMessage(buf);
thread->stats.AddBytes(bytes);
}
}
void SnappyUncompress(ThreadState* thread) {
RandomGenerator gen;
Slice input = gen.Generate(Options().block_size);
std::string compressed;
bool ok = port::Snappy_Compress(input.data(), input.size(), &compressed);
int64_t bytes = 0;
char* uncompressed = new char[input.size()];
while (ok && bytes < 1024 * 1048576) { // Compress 1G
ok = port::Snappy_Uncompress(compressed.data(), compressed.size(),
uncompressed);
bytes += input.size();
thread->stats.FinishedSingleOp();
}
delete[] uncompressed;
if (!ok) {
thread->stats.AddMessage("(snappy failure)");
} else {
thread->stats.AddBytes(bytes);
}
}
void Open() {
assert(db_ == NULL);
Options options;
options.create_if_missing = !FLAGS_use_existing_db;
options.block_cache = cache_;
options.write_buffer_size = FLAGS_write_buffer_size;
options.max_open_files = FLAGS_open_files;
options.filter_policy = filter_policy_;
Status s = DB::Open(options, FLAGS_db, &db_);
if (!s.ok()) {
fprintf(stderr, "open error: %s\n", s.ToString().c_str());
exit(1);
}
}
void WriteSeq(ThreadState* thread) {
DoWrite(thread, true);
}
void WriteRandom(ThreadState* thread) {
DoWrite(thread, false);
}
void DoWrite(ThreadState* thread, bool seq) {
if (num_ != FLAGS_num) {
char msg[100];
snprintf(msg, sizeof(msg), "(%d ops)", num_);
thread->stats.AddMessage(msg);
}
RandomGenerator gen;
WriteBatch batch;
Status s;
int64_t bytes = 0;
for (int i = 0; i < num_; i += entries_per_batch_) {
batch.Clear();
for (int j = 0; j < entries_per_batch_; j++) {
const int k = seq ? i+j : (thread->rand.Next() % FLAGS_num);
char key[100];
snprintf(key, sizeof(key), "%016d", k);
batch.Put(key, gen.Generate(value_size_));
bytes += value_size_ + strlen(key);
thread->stats.FinishedSingleOp();
}
s = db_->Write(write_options_, &batch);
if (!s.ok()) {
fprintf(stderr, "put error: %s\n", s.ToString().c_str());
exit(1);
}
}
thread->stats.AddBytes(bytes);
}
void ReadSequential(ThreadState* thread) {
Iterator* iter = db_->NewIterator(ReadOptions());
int i = 0;
int64_t bytes = 0;
for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) {
bytes += iter->key().size() + iter->value().size();
thread->stats.FinishedSingleOp();
++i;
}
delete iter;
thread->stats.AddBytes(bytes);
}
void ReadReverse(ThreadState* thread) {
Iterator* iter = db_->NewIterator(ReadOptions());
int i = 0;
int64_t bytes = 0;
for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) {
bytes += iter->key().size() + iter->value().size();
thread->stats.FinishedSingleOp();
++i;
}
delete iter;
thread->stats.AddBytes(bytes);
}
void ReadRandom(ThreadState* thread) {
ReadOptions options;
std::string value;
int found = 0;
for (int i = 0; i < reads_; i++) {
char key[100];
const int k = thread->rand.Next() % FLAGS_num;
snprintf(key, sizeof(key), "%016d", k);
if (db_->Get(options, key, &value).ok()) {
found++;
}
thread->stats.FinishedSingleOp();
}
char msg[100];
snprintf(msg, sizeof(msg), "(%d of %d found)", found, num_);
thread->stats.AddMessage(msg);
}
void ReadMissing(ThreadState* thread) {
ReadOptions options;
std::string value;
for (int i = 0; i < reads_; i++) {
char key[100];
const int k = thread->rand.Next() % FLAGS_num;
snprintf(key, sizeof(key), "%016d.", k);
db_->Get(options, key, &value);
thread->stats.FinishedSingleOp();
}
}
void ReadHot(ThreadState* thread) {
ReadOptions options;
std::string value;
const int range = (FLAGS_num + 99) / 100;
for (int i = 0; i < reads_; i++) {
char key[100];
const int k = thread->rand.Next() % range;
snprintf(key, sizeof(key), "%016d", k);
db_->Get(options, key, &value);
thread->stats.FinishedSingleOp();
}
}
void SeekRandom(ThreadState* thread) {
ReadOptions options;
std::string value;
int found = 0;
for (int i = 0; i < reads_; i++) {
Iterator* iter = db_->NewIterator(options);
char key[100];
const int k = thread->rand.Next() % FLAGS_num;
snprintf(key, sizeof(key), "%016d", k);
iter->Seek(key);
if (iter->Valid() && iter->key() == key) found++;
delete iter;
thread->stats.FinishedSingleOp();
}
char msg[100];
snprintf(msg, sizeof(msg), "(%d of %d found)", found, num_);
thread->stats.AddMessage(msg);
}
void DoDelete(ThreadState* thread, bool seq) {
RandomGenerator gen;
WriteBatch batch;
Status s;
for (int i = 0; i < num_; i += entries_per_batch_) {
batch.Clear();
for (int j = 0; j < entries_per_batch_; j++) {
const int k = seq ? i+j : (thread->rand.Next() % FLAGS_num);
char key[100];
snprintf(key, sizeof(key), "%016d", k);
batch.Delete(key);
thread->stats.FinishedSingleOp();
}
s = db_->Write(write_options_, &batch);
if (!s.ok()) {
fprintf(stderr, "del error: %s\n", s.ToString().c_str());
exit(1);
}
}
}
void DeleteSeq(ThreadState* thread) {
DoDelete(thread, true);
}
void DeleteRandom(ThreadState* thread) {
DoDelete(thread, false);
}
void ReadWhileWriting(ThreadState* thread) {
if (thread->tid > 0) {
ReadRandom(thread);
} else {
// Special thread that keeps writing until other threads are done.
RandomGenerator gen;
while (true) {
{
MutexLock l(&thread->shared->mu);
if (thread->shared->num_done + 1 >= thread->shared->num_initialized) {
// Other threads have finished
break;
}
}
const int k = thread->rand.Next() % FLAGS_num;
char key[100];
snprintf(key, sizeof(key), "%016d", k);
Status s = db_->Put(write_options_, key, gen.Generate(value_size_));
if (!s.ok()) {
fprintf(stderr, "put error: %s\n", s.ToString().c_str());
exit(1);
}
}
// Do not count any of the preceding work/delay in stats.
thread->stats.Start();
}
}
void Compact(ThreadState* thread) {
db_->CompactRange(NULL, NULL);
}
void PrintStats(const char* key) {
std::string stats;
if (!db_->GetProperty(key, &stats)) {
stats = "(failed)";
}
fprintf(stdout, "\n%s\n", stats.c_str());
}
static void WriteToFile(void* arg, const char* buf, int n) {
reinterpret_cast<WritableFile*>(arg)->Append(Slice(buf, n));
}
void HeapProfile() {
char fname[100];
snprintf(fname, sizeof(fname), "%s/heap-%04d", FLAGS_db, ++heap_counter_);
WritableFile* file;
Status s = Env::Default()->NewWritableFile(fname, &file);
if (!s.ok()) {
fprintf(stderr, "%s\n", s.ToString().c_str());
return;
}
bool ok = port::GetHeapProfile(WriteToFile, file);
delete file;
if (!ok) {
fprintf(stderr, "heap profiling not supported\n");
Env::Default()->DeleteFile(fname);
}
}
};
} // namespace leveldb
int main(int argc, char** argv) {
FLAGS_write_buffer_size = leveldb::Options().write_buffer_size;
FLAGS_open_files = leveldb::Options().max_open_files;
std::string default_db_path;
for (int i = 1; i < argc; i++) {
double d;
int n;
char junk;
if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) {
FLAGS_benchmarks = argv[i] + strlen("--benchmarks=");
} else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) {
FLAGS_compression_ratio = d;
} else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_histogram = n;
} else if (sscanf(argv[i], "--use_existing_db=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_use_existing_db = n;
} else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
FLAGS_num = n;
} else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) {
FLAGS_reads = n;
} else if (sscanf(argv[i], "--threads=%d%c", &n, &junk) == 1) {
FLAGS_threads = n;
} else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) {
FLAGS_value_size = n;
} else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
FLAGS_write_buffer_size = n;
} else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) {
FLAGS_cache_size = n;
} else if (sscanf(argv[i], "--bloom_bits=%d%c", &n, &junk) == 1) {
FLAGS_bloom_bits = n;
} else if (sscanf(argv[i], "--open_files=%d%c", &n, &junk) == 1) {
FLAGS_open_files = n;
} else if (strncmp(argv[i], "--db=", 5) == 0) {
FLAGS_db = argv[i] + 5;
} else {
fprintf(stderr, "Invalid flag '%s'\n", argv[i]);
exit(1);
}
}
// Choose a location for the test database if none given with --db=<path>
if (FLAGS_db == NULL) {
leveldb::Env::Default()->GetTestDirectory(&default_db_path);
default_db_path += "/dbbench";
FLAGS_db = default_db_path.c_str();
}
leveldb::Benchmark benchmark;
benchmark.Run();
return 0;
}

1498
src/leveldb/db/db_impl.cc Normal file

File diff suppressed because it is too large Load Diff

210
src/leveldb/db/db_impl.h Normal file
View File

@@ -0,0 +1,210 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_DB_IMPL_H_
#define STORAGE_LEVELDB_DB_DB_IMPL_H_
#include <deque>
#include <set>
#include "db/dbformat.h"
#include "db/log_writer.h"
#include "db/snapshot.h"
#include "leveldb/db.h"
#include "leveldb/env.h"
#include "port/port.h"
#include "port/thread_annotations.h"
namespace leveldb {
class MemTable;
class TableCache;
class Version;
class VersionEdit;
class VersionSet;
class DBImpl : public DB {
public:
DBImpl(const Options& options, const std::string& dbname);
virtual ~DBImpl();
// Implementations of the DB interface
virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value);
virtual Status Delete(const WriteOptions&, const Slice& key);
virtual Status Write(const WriteOptions& options, WriteBatch* updates);
virtual Status Get(const ReadOptions& options,
const Slice& key,
std::string* value);
virtual Iterator* NewIterator(const ReadOptions&);
virtual const Snapshot* GetSnapshot();
virtual void ReleaseSnapshot(const Snapshot* snapshot);
virtual bool GetProperty(const Slice& property, std::string* value);
virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
virtual void CompactRange(const Slice* begin, const Slice* end);
// Extra methods (for testing) that are not in the public DB interface
// Compact any files in the named level that overlap [*begin,*end]
void TEST_CompactRange(int level, const Slice* begin, const Slice* end);
// Force current memtable contents to be compacted.
Status TEST_CompactMemTable();
// Return an internal iterator over the current state of the database.
// The keys of this iterator are internal keys (see format.h).
// The returned iterator should be deleted when no longer needed.
Iterator* TEST_NewInternalIterator();
// Return the maximum overlapping data (in bytes) at next level for any
// file at a level >= 1.
int64_t TEST_MaxNextLevelOverlappingBytes();
// Record a sample of bytes read at the specified internal key.
// Samples are taken approximately once every config::kReadBytesPeriod
// bytes.
void RecordReadSample(Slice key);
private:
friend class DB;
struct CompactionState;
struct Writer;
Iterator* NewInternalIterator(const ReadOptions&,
SequenceNumber* latest_snapshot,
uint32_t* seed);
Status NewDB();
// Recover the descriptor from persistent storage. May do a significant
// amount of work to recover recently logged updates. Any changes to
// be made to the descriptor are added to *edit.
Status Recover(VersionEdit* edit) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
void MaybeIgnoreError(Status* s) const;
// Delete any unneeded files and stale in-memory entries.
void DeleteObsoleteFiles();
// Compact the in-memory write buffer to disk. Switches to a new
// log-file/memtable and writes a new descriptor iff successful.
Status CompactMemTable()
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
Status RecoverLogFile(uint64_t log_number,
VersionEdit* edit,
SequenceNumber* max_sequence)
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
Status WriteLevel0Table(MemTable* mem, VersionEdit* edit, Version* base)
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
Status MakeRoomForWrite(bool force /* compact even if there is room? */)
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
WriteBatch* BuildBatchGroup(Writer** last_writer);
void MaybeScheduleCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
static void BGWork(void* db);
void BackgroundCall();
Status BackgroundCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
void CleanupCompaction(CompactionState* compact)
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
Status DoCompactionWork(CompactionState* compact)
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
Status OpenCompactionOutputFile(CompactionState* compact);
Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
Status InstallCompactionResults(CompactionState* compact)
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Constant after construction
Env* const env_;
const InternalKeyComparator internal_comparator_;
const InternalFilterPolicy internal_filter_policy_;
const Options options_; // options_.comparator == &internal_comparator_
bool owns_info_log_;
bool owns_cache_;
const std::string dbname_;
// table_cache_ provides its own synchronization
TableCache* table_cache_;
// Lock over the persistent DB state. Non-NULL iff successfully acquired.
FileLock* db_lock_;
// State below is protected by mutex_
port::Mutex mutex_;
port::AtomicPointer shutting_down_;
port::CondVar bg_cv_; // Signalled when background work finishes
MemTable* mem_;
MemTable* imm_; // Memtable being compacted
port::AtomicPointer has_imm_; // So bg thread can detect non-NULL imm_
WritableFile* logfile_;
uint64_t logfile_number_;
log::Writer* log_;
uint32_t seed_; // For sampling.
// Queue of writers.
std::deque<Writer*> writers_;
WriteBatch* tmp_batch_;
SnapshotList snapshots_;
// Set of table files to protect from deletion because they are
// part of ongoing compactions.
std::set<uint64_t> pending_outputs_;
// Has a background compaction been scheduled or is running?
bool bg_compaction_scheduled_;
// Information for a manual compaction
struct ManualCompaction {
int level;
bool done;
const InternalKey* begin; // NULL means beginning of key range
const InternalKey* end; // NULL means end of key range
InternalKey tmp_storage; // Used to keep track of compaction progress
};
ManualCompaction* manual_compaction_;
VersionSet* versions_;
// Have we encountered a background error in paranoid mode?
Status bg_error_;
int consecutive_compaction_errors_;
// Per level compaction stats. stats_[level] stores the stats for
// compactions that produced data for the specified "level".
struct CompactionStats {
int64_t micros;
int64_t bytes_read;
int64_t bytes_written;
CompactionStats() : micros(0), bytes_read(0), bytes_written(0) { }
void Add(const CompactionStats& c) {
this->micros += c.micros;
this->bytes_read += c.bytes_read;
this->bytes_written += c.bytes_written;
}
};
CompactionStats stats_[config::kNumLevels];
// No copying allowed
DBImpl(const DBImpl&);
void operator=(const DBImpl&);
const Comparator* user_comparator() const {
return internal_comparator_.user_comparator();
}
};
// Sanitize db options. The caller should delete result.info_log if
// it is not equal to src.info_log.
extern Options SanitizeOptions(const std::string& db,
const InternalKeyComparator* icmp,
const InternalFilterPolicy* ipolicy,
const Options& src);
} // namespace leveldb
#endif // STORAGE_LEVELDB_DB_DB_IMPL_H_

316
src/leveldb/db/db_iter.cc Normal file
View File

@@ -0,0 +1,316 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/db_iter.h"
#include "db/filename.h"
#include "db/db_impl.h"
#include "db/dbformat.h"
#include "leveldb/env.h"
#include "leveldb/iterator.h"
#include "port/port.h"
#include "util/logging.h"
#include "util/mutexlock.h"
#include "util/random.h"
namespace leveldb {
#if 0
static void DumpInternalIter(Iterator* iter) {
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ParsedInternalKey k;
if (!ParseInternalKey(iter->key(), &k)) {
fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str());
} else {
fprintf(stderr, "@ '%s'\n", k.DebugString().c_str());
}
}
}
#endif
namespace {
// Memtables and sstables that make the DB representation contain
// (userkey,seq,type) => uservalue entries. DBIter
// combines multiple entries for the same userkey found in the DB
// representation into a single entry while accounting for sequence
// numbers, deletion markers, overwrites, etc.
class DBIter: public Iterator {
public:
// Which direction is the iterator currently moving?
// (1) When moving forward, the internal iterator is positioned at
// the exact entry that yields this->key(), this->value()
// (2) When moving backwards, the internal iterator is positioned
// just before all entries whose user key == this->key().
enum Direction {
kForward,
kReverse
};
DBIter(DBImpl* db, const Comparator* cmp, Iterator* iter, SequenceNumber s,
uint32_t seed)
: db_(db),
user_comparator_(cmp),
iter_(iter),
sequence_(s),
direction_(kForward),
valid_(false),
rnd_(seed),
bytes_counter_(RandomPeriod()) {
}
virtual ~DBIter() {
delete iter_;
}
virtual bool Valid() const { return valid_; }
virtual Slice key() const {
assert(valid_);
return (direction_ == kForward) ? ExtractUserKey(iter_->key()) : saved_key_;
}
virtual Slice value() const {
assert(valid_);
return (direction_ == kForward) ? iter_->value() : saved_value_;
}
virtual Status status() const {
if (status_.ok()) {
return iter_->status();
} else {
return status_;
}
}
virtual void Next();
virtual void Prev();
virtual void Seek(const Slice& target);
virtual void SeekToFirst();
virtual void SeekToLast();
private:
void FindNextUserEntry(bool skipping, std::string* skip);
void FindPrevUserEntry();
bool ParseKey(ParsedInternalKey* key);
inline void SaveKey(const Slice& k, std::string* dst) {
dst->assign(k.data(), k.size());
}
inline void ClearSavedValue() {
if (saved_value_.capacity() > 1048576) {
std::string empty;
swap(empty, saved_value_);
} else {
saved_value_.clear();
}
}
// Pick next gap with average value of config::kReadBytesPeriod.
ssize_t RandomPeriod() {
return rnd_.Uniform(2*config::kReadBytesPeriod);
}
DBImpl* db_;
const Comparator* const user_comparator_;
Iterator* const iter_;
SequenceNumber const sequence_;
Status status_;
std::string saved_key_; // == current key when direction_==kReverse
std::string saved_value_; // == current raw value when direction_==kReverse
Direction direction_;
bool valid_;
Random rnd_;
ssize_t bytes_counter_;
// No copying allowed
DBIter(const DBIter&);
void operator=(const DBIter&);
};
inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
Slice k = iter_->key();
ssize_t n = k.size() + iter_->value().size();
bytes_counter_ -= n;
while (bytes_counter_ < 0) {
bytes_counter_ += RandomPeriod();
db_->RecordReadSample(k);
}
if (!ParseInternalKey(k, ikey)) {
status_ = Status::Corruption("corrupted internal key in DBIter");
return false;
} else {
return true;
}
}
void DBIter::Next() {
assert(valid_);
if (direction_ == kReverse) { // Switch directions?
direction_ = kForward;
// iter_ is pointing just before the entries for this->key(),
// so advance into the range of entries for this->key() and then
// use the normal skipping code below.
if (!iter_->Valid()) {
iter_->SeekToFirst();
} else {
iter_->Next();
}
if (!iter_->Valid()) {
valid_ = false;
saved_key_.clear();
return;
}
}
// Temporarily use saved_key_ as storage for key to skip.
std::string* skip = &saved_key_;
SaveKey(ExtractUserKey(iter_->key()), skip);
FindNextUserEntry(true, skip);
}
void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
// Loop until we hit an acceptable entry to yield
assert(iter_->Valid());
assert(direction_ == kForward);
do {
ParsedInternalKey ikey;
if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
switch (ikey.type) {
case kTypeDeletion:
// Arrange to skip all upcoming entries for this key since
// they are hidden by this deletion.
SaveKey(ikey.user_key, skip);
skipping = true;
break;
case kTypeValue:
if (skipping &&
user_comparator_->Compare(ikey.user_key, *skip) <= 0) {
// Entry hidden
} else {
valid_ = true;
saved_key_.clear();
return;
}
break;
}
}
iter_->Next();
} while (iter_->Valid());
saved_key_.clear();
valid_ = false;
}
void DBIter::Prev() {
assert(valid_);
if (direction_ == kForward) { // Switch directions?
// iter_ is pointing at the current entry. Scan backwards until
// the key changes so we can use the normal reverse scanning code.
assert(iter_->Valid()); // Otherwise valid_ would have been false
SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
while (true) {
iter_->Prev();
if (!iter_->Valid()) {
valid_ = false;
saved_key_.clear();
ClearSavedValue();
return;
}
if (user_comparator_->Compare(ExtractUserKey(iter_->key()),
saved_key_) < 0) {
break;
}
}
direction_ = kReverse;
}
FindPrevUserEntry();
}
void DBIter::FindPrevUserEntry() {
assert(direction_ == kReverse);
ValueType value_type = kTypeDeletion;
if (iter_->Valid()) {
do {
ParsedInternalKey ikey;
if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
if ((value_type != kTypeDeletion) &&
user_comparator_->Compare(ikey.user_key, saved_key_) < 0) {
// We encountered a non-deleted value in entries for previous keys,
break;
}
value_type = ikey.type;
if (value_type == kTypeDeletion) {
saved_key_.clear();
ClearSavedValue();
} else {
Slice raw_value = iter_->value();
if (saved_value_.capacity() > raw_value.size() + 1048576) {
std::string empty;
swap(empty, saved_value_);
}
SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
saved_value_.assign(raw_value.data(), raw_value.size());
}
}
iter_->Prev();
} while (iter_->Valid());
}
if (value_type == kTypeDeletion) {
// End
valid_ = false;
saved_key_.clear();
ClearSavedValue();
direction_ = kForward;
} else {
valid_ = true;
}
}
void DBIter::Seek(const Slice& target) {
direction_ = kForward;
ClearSavedValue();
saved_key_.clear();
AppendInternalKey(
&saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek));
iter_->Seek(saved_key_);
if (iter_->Valid()) {
FindNextUserEntry(false, &saved_key_ /* temporary storage */);
} else {
valid_ = false;
}
}
void DBIter::SeekToFirst() {
direction_ = kForward;
ClearSavedValue();
iter_->SeekToFirst();
if (iter_->Valid()) {
FindNextUserEntry(false, &saved_key_ /* temporary storage */);
} else {
valid_ = false;
}
}
void DBIter::SeekToLast() {
direction_ = kReverse;
ClearSavedValue();
iter_->SeekToLast();
FindPrevUserEntry();
}
} // anonymous namespace
Iterator* NewDBIterator(
DBImpl* db,
const Comparator* user_key_comparator,
Iterator* internal_iter,
SequenceNumber sequence,
uint32_t seed) {
return new DBIter(db, user_key_comparator, internal_iter, sequence, seed);
}
} // namespace leveldb

28
src/leveldb/db/db_iter.h Normal file
View File

@@ -0,0 +1,28 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_DB_ITER_H_
#define STORAGE_LEVELDB_DB_DB_ITER_H_
#include <stdint.h>
#include "leveldb/db.h"
#include "db/dbformat.h"
namespace leveldb {
class DBImpl;
// Return a new iterator that converts internal keys (yielded by
// "*internal_iter") that were live at the specified "sequence" number
// into appropriate user keys.
extern Iterator* NewDBIterator(
DBImpl* db,
const Comparator* user_key_comparator,
Iterator* internal_iter,
SequenceNumber sequence,
uint32_t seed);
} // namespace leveldb
#endif // STORAGE_LEVELDB_DB_DB_ITER_H_

2092
src/leveldb/db/db_test.cc Normal file

File diff suppressed because it is too large Load Diff

140
src/leveldb/db/dbformat.cc Normal file
View File

@@ -0,0 +1,140 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <stdio.h>
#include "db/dbformat.h"
#include "port/port.h"
#include "util/coding.h"
namespace leveldb {
static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
assert(seq <= kMaxSequenceNumber);
assert(t <= kValueTypeForSeek);
return (seq << 8) | t;
}
void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
result->append(key.user_key.data(), key.user_key.size());
PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
}
std::string ParsedInternalKey::DebugString() const {
char buf[50];
snprintf(buf, sizeof(buf), "' @ %llu : %d",
(unsigned long long) sequence,
int(type));
std::string result = "'";
result += EscapeString(user_key.ToString());
result += buf;
return result;
}
std::string InternalKey::DebugString() const {
std::string result;
ParsedInternalKey parsed;
if (ParseInternalKey(rep_, &parsed)) {
result = parsed.DebugString();
} else {
result = "(bad)";
result.append(EscapeString(rep_));
}
return result;
}
const char* InternalKeyComparator::Name() const {
return "leveldb.InternalKeyComparator";
}
int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
// Order by:
// increasing user key (according to user-supplied comparator)
// decreasing sequence number
// decreasing type (though sequence# should be enough to disambiguate)
int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
if (r == 0) {
const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
if (anum > bnum) {
r = -1;
} else if (anum < bnum) {
r = +1;
}
}
return r;
}
void InternalKeyComparator::FindShortestSeparator(
std::string* start,
const Slice& limit) const {
// Attempt to shorten the user portion of the key
Slice user_start = ExtractUserKey(*start);
Slice user_limit = ExtractUserKey(limit);
std::string tmp(user_start.data(), user_start.size());
user_comparator_->FindShortestSeparator(&tmp, user_limit);
if (tmp.size() < user_start.size() &&
user_comparator_->Compare(user_start, tmp) < 0) {
// User key has become shorter physically, but larger logically.
// Tack on the earliest possible number to the shortened user key.
PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
assert(this->Compare(*start, tmp) < 0);
assert(this->Compare(tmp, limit) < 0);
start->swap(tmp);
}
}
void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
Slice user_key = ExtractUserKey(*key);
std::string tmp(user_key.data(), user_key.size());
user_comparator_->FindShortSuccessor(&tmp);
if (tmp.size() < user_key.size() &&
user_comparator_->Compare(user_key, tmp) < 0) {
// User key has become shorter physically, but larger logically.
// Tack on the earliest possible number to the shortened user key.
PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
assert(this->Compare(*key, tmp) < 0);
key->swap(tmp);
}
}
const char* InternalFilterPolicy::Name() const {
return user_policy_->Name();
}
void InternalFilterPolicy::CreateFilter(const Slice* keys, int n,
std::string* dst) const {
// We rely on the fact that the code in table.cc does not mind us
// adjusting keys[].
Slice* mkey = const_cast<Slice*>(keys);
for (int i = 0; i < n; i++) {
mkey[i] = ExtractUserKey(keys[i]);
// TODO(sanjay): Suppress dups?
}
user_policy_->CreateFilter(keys, n, dst);
}
bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const {
return user_policy_->KeyMayMatch(ExtractUserKey(key), f);
}
LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
size_t usize = user_key.size();
size_t needed = usize + 13; // A conservative estimate
char* dst;
if (needed <= sizeof(space_)) {
dst = space_;
} else {
dst = new char[needed];
}
start_ = dst;
dst = EncodeVarint32(dst, usize + 8);
kstart_ = dst;
memcpy(dst, user_key.data(), usize);
dst += usize;
EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
dst += 8;
end_ = dst;
}
} // namespace leveldb

230
src/leveldb/db/dbformat.h Normal file
View File

@@ -0,0 +1,230 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_FORMAT_H_
#define STORAGE_LEVELDB_DB_FORMAT_H_
#include <stdio.h>
#include "leveldb/comparator.h"
#include "leveldb/db.h"
#include "leveldb/filter_policy.h"
#include "leveldb/slice.h"
#include "leveldb/table_builder.h"
#include "util/coding.h"
#include "util/logging.h"
namespace leveldb {
// Grouping of constants. We may want to make some of these
// parameters set via options.
namespace config {
static const int kNumLevels = 7;
// Level-0 compaction is started when we hit this many files.
static const int kL0_CompactionTrigger = 4;
// Soft limit on number of level-0 files. We slow down writes at this point.
static const int kL0_SlowdownWritesTrigger = 8;
// Maximum number of level-0 files. We stop writes at this point.
static const int kL0_StopWritesTrigger = 12;
// Maximum level to which a new compacted memtable is pushed if it
// does not create overlap. We try to push to level 2 to avoid the
// relatively expensive level 0=>1 compactions and to avoid some
// expensive manifest file operations. We do not push all the way to
// the largest level since that can generate a lot of wasted disk
// space if the same key space is being repeatedly overwritten.
static const int kMaxMemCompactLevel = 2;
// Approximate gap in bytes between samples of data read during iteration.
static const int kReadBytesPeriod = 1048576;
} // namespace config
class InternalKey;
// Value types encoded as the last component of internal keys.
// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
// data structures.
enum ValueType {
kTypeDeletion = 0x0,
kTypeValue = 0x1
};
// kValueTypeForSeek defines the ValueType that should be passed when
// constructing a ParsedInternalKey object for seeking to a particular
// sequence number (since we sort sequence numbers in decreasing order
// and the value type is embedded as the low 8 bits in the sequence
// number in internal keys, we need to use the highest-numbered
// ValueType, not the lowest).
static const ValueType kValueTypeForSeek = kTypeValue;
typedef uint64_t SequenceNumber;
// We leave eight bits empty at the bottom so a type and sequence#
// can be packed together into 64-bits.
static const SequenceNumber kMaxSequenceNumber =
((0x1ull << 56) - 1);
struct ParsedInternalKey {
Slice user_key;
SequenceNumber sequence;
ValueType type;
ParsedInternalKey() { } // Intentionally left uninitialized (for speed)
ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
: user_key(u), sequence(seq), type(t) { }
std::string DebugString() const;
};
// Return the length of the encoding of "key".
inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
return key.user_key.size() + 8;
}
// Append the serialization of "key" to *result.
extern void AppendInternalKey(std::string* result,
const ParsedInternalKey& key);
// Attempt to parse an internal key from "internal_key". On success,
// stores the parsed data in "*result", and returns true.
//
// On error, returns false, leaves "*result" in an undefined state.
extern bool ParseInternalKey(const Slice& internal_key,
ParsedInternalKey* result);
// Returns the user key portion of an internal key.
inline Slice ExtractUserKey(const Slice& internal_key) {
assert(internal_key.size() >= 8);
return Slice(internal_key.data(), internal_key.size() - 8);
}
inline ValueType ExtractValueType(const Slice& internal_key) {
assert(internal_key.size() >= 8);
const size_t n = internal_key.size();
uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
unsigned char c = num & 0xff;
return static_cast<ValueType>(c);
}
// A comparator for internal keys that uses a specified comparator for
// the user key portion and breaks ties by decreasing sequence number.
class InternalKeyComparator : public Comparator {
private:
const Comparator* user_comparator_;
public:
explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) { }
virtual const char* Name() const;
virtual int Compare(const Slice& a, const Slice& b) const;
virtual void FindShortestSeparator(
std::string* start,
const Slice& limit) const;
virtual void FindShortSuccessor(std::string* key) const;
const Comparator* user_comparator() const { return user_comparator_; }
int Compare(const InternalKey& a, const InternalKey& b) const;
};
// Filter policy wrapper that converts from internal keys to user keys
class InternalFilterPolicy : public FilterPolicy {
private:
const FilterPolicy* const user_policy_;
public:
explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }
virtual const char* Name() const;
virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const;
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
};
// Modules in this directory should keep internal keys wrapped inside
// the following class instead of plain strings so that we do not
// incorrectly use string comparisons instead of an InternalKeyComparator.
class InternalKey {
private:
std::string rep_;
public:
InternalKey() { } // Leave rep_ as empty to indicate it is invalid
InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
}
void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
Slice Encode() const {
assert(!rep_.empty());
return rep_;
}
Slice user_key() const { return ExtractUserKey(rep_); }
void SetFrom(const ParsedInternalKey& p) {
rep_.clear();
AppendInternalKey(&rep_, p);
}
void Clear() { rep_.clear(); }
std::string DebugString() const;
};
inline int InternalKeyComparator::Compare(
const InternalKey& a, const InternalKey& b) const {
return Compare(a.Encode(), b.Encode());
}
inline bool ParseInternalKey(const Slice& internal_key,
ParsedInternalKey* result) {
const size_t n = internal_key.size();
if (n < 8) return false;
uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
unsigned char c = num & 0xff;
result->sequence = num >> 8;
result->type = static_cast<ValueType>(c);
result->user_key = Slice(internal_key.data(), n - 8);
return (c <= static_cast<unsigned char>(kTypeValue));
}
// A helper class useful for DBImpl::Get()
class LookupKey {
public:
// Initialize *this for looking up user_key at a snapshot with
// the specified sequence number.
LookupKey(const Slice& user_key, SequenceNumber sequence);
~LookupKey();
// Return a key suitable for lookup in a MemTable.
Slice memtable_key() const { return Slice(start_, end_ - start_); }
// Return an internal key (suitable for passing to an internal iterator)
Slice internal_key() const { return Slice(kstart_, end_ - kstart_); }
// Return the user key
Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); }
private:
// We construct a char array of the form:
// klength varint32 <-- start_
// userkey char[klength] <-- kstart_
// tag uint64
// <-- end_
// The array is a suitable MemTable key.
// The suffix starting with "userkey" can be used as an InternalKey.
const char* start_;
const char* kstart_;
const char* end_;
char space_[200]; // Avoid allocation for short keys
// No copying allowed
LookupKey(const LookupKey&);
void operator=(const LookupKey&);
};
inline LookupKey::~LookupKey() {
if (start_ != space_) delete[] start_;
}
} // namespace leveldb
#endif // STORAGE_LEVELDB_DB_FORMAT_H_

View File

@@ -0,0 +1,112 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/dbformat.h"
#include "util/logging.h"
#include "util/testharness.h"
namespace leveldb {
static std::string IKey(const std::string& user_key,
uint64_t seq,
ValueType vt) {
std::string encoded;
AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
return encoded;
}
static std::string Shorten(const std::string& s, const std::string& l) {
std::string result = s;
InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l);
return result;
}
static std::string ShortSuccessor(const std::string& s) {
std::string result = s;
InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result);
return result;
}
static void TestKey(const std::string& key,
uint64_t seq,
ValueType vt) {
std::string encoded = IKey(key, seq, vt);
Slice in(encoded);
ParsedInternalKey decoded("", 0, kTypeValue);
ASSERT_TRUE(ParseInternalKey(in, &decoded));
ASSERT_EQ(key, decoded.user_key.ToString());
ASSERT_EQ(seq, decoded.sequence);
ASSERT_EQ(vt, decoded.type);
ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded));
}
class FormatTest { };
TEST(FormatTest, InternalKey_EncodeDecode) {
const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" };
const uint64_t seq[] = {
1, 2, 3,
(1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1,
(1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1,
(1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1
};
for (int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
for (int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
TestKey(keys[k], seq[s], kTypeValue);
TestKey("hello", 1, kTypeDeletion);
}
}
}
TEST(FormatTest, InternalKeyShortSeparator) {
// When user keys are same
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foo", 99, kTypeValue)));
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foo", 101, kTypeValue)));
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foo", 100, kTypeValue)));
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foo", 100, kTypeDeletion)));
// When user keys are misordered
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("bar", 99, kTypeValue)));
// When user keys are different, but correctly ordered
ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
Shorten(IKey("foo", 100, kTypeValue),
IKey("hello", 200, kTypeValue)));
// When start user key is prefix of limit user key
ASSERT_EQ(IKey("foo", 100, kTypeValue),
Shorten(IKey("foo", 100, kTypeValue),
IKey("foobar", 200, kTypeValue)));
// When limit user key is prefix of start user key
ASSERT_EQ(IKey("foobar", 100, kTypeValue),
Shorten(IKey("foobar", 100, kTypeValue),
IKey("foo", 200, kTypeValue)));
}
TEST(FormatTest, InternalKeyShortestSuccessor) {
ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
ShortSuccessor(IKey("foo", 100, kTypeValue)));
ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
}
} // namespace leveldb
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

139
src/leveldb/db/filename.cc Normal file
View File

@@ -0,0 +1,139 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <ctype.h>
#include <stdio.h>
#include "db/filename.h"
#include "db/dbformat.h"
#include "leveldb/env.h"
#include "util/logging.h"
namespace leveldb {
// A utility routine: write "data" to the named file and Sync() it.
extern Status WriteStringToFileSync(Env* env, const Slice& data,
const std::string& fname);
static std::string MakeFileName(const std::string& name, uint64_t number,
const char* suffix) {
char buf[100];
snprintf(buf, sizeof(buf), "/%06llu.%s",
static_cast<unsigned long long>(number),
suffix);
return name + buf;
}
std::string LogFileName(const std::string& name, uint64_t number) {
assert(number > 0);
return MakeFileName(name, number, "log");
}
std::string TableFileName(const std::string& name, uint64_t number) {
assert(number > 0);
return MakeFileName(name, number, "sst");
}
std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
assert(number > 0);
char buf[100];
snprintf(buf, sizeof(buf), "/MANIFEST-%06llu",
static_cast<unsigned long long>(number));
return dbname + buf;
}
std::string CurrentFileName(const std::string& dbname) {
return dbname + "/CURRENT";
}
std::string LockFileName(const std::string& dbname) {
return dbname + "/LOCK";
}
std::string TempFileName(const std::string& dbname, uint64_t number) {
assert(number > 0);
return MakeFileName(dbname, number, "dbtmp");
}
std::string InfoLogFileName(const std::string& dbname) {
return dbname + "/LOG";
}
// Return the name of the old info log file for "dbname".
std::string OldInfoLogFileName(const std::string& dbname) {
return dbname + "/LOG.old";
}
// Owned filenames have the form:
// dbname/CURRENT
// dbname/LOCK
// dbname/LOG
// dbname/LOG.old
// dbname/MANIFEST-[0-9]+
// dbname/[0-9]+.(log|sst)
bool ParseFileName(const std::string& fname,
uint64_t* number,
FileType* type) {
Slice rest(fname);
if (rest == "CURRENT") {
*number = 0;
*type = kCurrentFile;
} else if (rest == "LOCK") {
*number = 0;
*type = kDBLockFile;
} else if (rest == "LOG" || rest == "LOG.old") {
*number = 0;
*type = kInfoLogFile;
} else if (rest.starts_with("MANIFEST-")) {
rest.remove_prefix(strlen("MANIFEST-"));
uint64_t num;
if (!ConsumeDecimalNumber(&rest, &num)) {
return false;
}
if (!rest.empty()) {
return false;
}
*type = kDescriptorFile;
*number = num;
} else {
// Avoid strtoull() to keep filename format independent of the
// current locale
uint64_t num;
if (!ConsumeDecimalNumber(&rest, &num)) {
return false;
}
Slice suffix = rest;
if (suffix == Slice(".log")) {
*type = kLogFile;
} else if (suffix == Slice(".sst")) {
*type = kTableFile;
} else if (suffix == Slice(".dbtmp")) {
*type = kTempFile;
} else {
return false;
}
*number = num;
}
return true;
}
Status SetCurrentFile(Env* env, const std::string& dbname,
uint64_t descriptor_number) {
// Remove leading "dbname/" and add newline to manifest file name
std::string manifest = DescriptorFileName(dbname, descriptor_number);
Slice contents = manifest;
assert(contents.starts_with(dbname + "/"));
contents.remove_prefix(dbname.size() + 1);
std::string tmp = TempFileName(dbname, descriptor_number);
Status s = WriteStringToFileSync(env, contents.ToString() + "\n", tmp);
if (s.ok()) {
s = env->RenameFile(tmp, CurrentFileName(dbname));
}
if (!s.ok()) {
env->DeleteFile(tmp);
}
return s;
}
} // namespace leveldb

80
src/leveldb/db/filename.h Normal file
View File

@@ -0,0 +1,80 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// File names used by DB code
#ifndef STORAGE_LEVELDB_DB_FILENAME_H_
#define STORAGE_LEVELDB_DB_FILENAME_H_
#include <stdint.h>
#include <string>
#include "leveldb/slice.h"
#include "leveldb/status.h"
#include "port/port.h"
namespace leveldb {
class Env;
enum FileType {
kLogFile,
kDBLockFile,
kTableFile,
kDescriptorFile,
kCurrentFile,
kTempFile,
kInfoLogFile // Either the current one, or an old one
};
// Return the name of the log file with the specified number
// in the db named by "dbname". The result will be prefixed with
// "dbname".
extern std::string LogFileName(const std::string& dbname, uint64_t number);
// Return the name of the sstable with the specified number
// in the db named by "dbname". The result will be prefixed with
// "dbname".
extern std::string TableFileName(const std::string& dbname, uint64_t number);
// Return the name of the descriptor file for the db named by
// "dbname" and the specified incarnation number. The result will be
// prefixed with "dbname".
extern std::string DescriptorFileName(const std::string& dbname,
uint64_t number);
// Return the name of the current file. This file contains the name
// of the current manifest file. The result will be prefixed with
// "dbname".
extern std::string CurrentFileName(const std::string& dbname);
// Return the name of the lock file for the db named by
// "dbname". The result will be prefixed with "dbname".
extern std::string LockFileName(const std::string& dbname);
// Return the name of a temporary file owned by the db named "dbname".
// The result will be prefixed with "dbname".
extern std::string TempFileName(const std::string& dbname, uint64_t number);
// Return the name of the info log file for "dbname".
extern std::string InfoLogFileName(const std::string& dbname);
// Return the name of the old info log file for "dbname".
extern std::string OldInfoLogFileName(const std::string& dbname);
// If filename is a leveldb file, store the type of the file in *type.
// The number encoded in the filename is stored in *number. If the
// filename was successfully parsed, returns true. Else return false.
extern bool ParseFileName(const std::string& filename,
uint64_t* number,
FileType* type);
// Make the CURRENT file point to the descriptor file with the
// specified number.
extern Status SetCurrentFile(Env* env, const std::string& dbname,
uint64_t descriptor_number);
} // namespace leveldb
#endif // STORAGE_LEVELDB_DB_FILENAME_H_

View File

@@ -0,0 +1,122 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/filename.h"
#include "db/dbformat.h"
#include "port/port.h"
#include "util/logging.h"
#include "util/testharness.h"
namespace leveldb {
class FileNameTest { };
TEST(FileNameTest, Parse) {
Slice db;
FileType type;
uint64_t number;
// Successful parses
static struct {
const char* fname;
uint64_t number;
FileType type;
} cases[] = {
{ "100.log", 100, kLogFile },
{ "0.log", 0, kLogFile },
{ "0.sst", 0, kTableFile },
{ "CURRENT", 0, kCurrentFile },
{ "LOCK", 0, kDBLockFile },
{ "MANIFEST-2", 2, kDescriptorFile },
{ "MANIFEST-7", 7, kDescriptorFile },
{ "LOG", 0, kInfoLogFile },
{ "LOG.old", 0, kInfoLogFile },
{ "18446744073709551615.log", 18446744073709551615ull, kLogFile },
};
for (int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
std::string f = cases[i].fname;
ASSERT_TRUE(ParseFileName(f, &number, &type)) << f;
ASSERT_EQ(cases[i].type, type) << f;
ASSERT_EQ(cases[i].number, number) << f;
}
// Errors
static const char* errors[] = {
"",
"foo",
"foo-dx-100.log",
".log",
"",
"manifest",
"CURREN",
"CURRENTX",
"MANIFES",
"MANIFEST",
"MANIFEST-",
"XMANIFEST-3",
"MANIFEST-3x",
"LOC",
"LOCKx",
"LO",
"LOGx",
"18446744073709551616.log",
"184467440737095516150.log",
"100",
"100.",
"100.lop"
};
for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
std::string f = errors[i];
ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
}
}
TEST(FileNameTest, Construction) {
uint64_t number;
FileType type;
std::string fname;
fname = CurrentFileName("foo");
ASSERT_EQ("foo/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(0, number);
ASSERT_EQ(kCurrentFile, type);
fname = LockFileName("foo");
ASSERT_EQ("foo/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(0, number);
ASSERT_EQ(kDBLockFile, type);
fname = LogFileName("foo", 192);
ASSERT_EQ("foo/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(192, number);
ASSERT_EQ(kLogFile, type);
fname = TableFileName("bar", 200);
ASSERT_EQ("bar/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(200, number);
ASSERT_EQ(kTableFile, type);
fname = DescriptorFileName("bar", 100);
ASSERT_EQ("bar/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(100, number);
ASSERT_EQ(kDescriptorFile, type);
fname = TempFileName("tmp", 999);
ASSERT_EQ("tmp/", std::string(fname.data(), 4));
ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
ASSERT_EQ(999, number);
ASSERT_EQ(kTempFile, type);
}
} // namespace leveldb
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

View File

@@ -0,0 +1,238 @@
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <stdio.h>
#include "db/dbformat.h"
#include "db/filename.h"
#include "db/log_reader.h"
#include "db/version_edit.h"
#include "db/write_batch_internal.h"
#include "leveldb/env.h"
#include "leveldb/iterator.h"
#include "leveldb/options.h"
#include "leveldb/status.h"
#include "leveldb/table.h"
#include "leveldb/write_batch.h"
#include "util/logging.h"
namespace leveldb {
namespace {
bool GuessType(const std::string& fname, FileType* type) {
size_t pos = fname.rfind('/');
std::string basename;
if (pos == std::string::npos) {
basename = fname;
} else {
basename = std::string(fname.data() + pos + 1, fname.size() - pos - 1);
}
uint64_t ignored;
return ParseFileName(basename, &ignored, type);
}
// Notified when log reader encounters corruption.
class CorruptionReporter : public log::Reader::Reporter {
public:
virtual void Corruption(size_t bytes, const Status& status) {
printf("corruption: %d bytes; %s\n",
static_cast<int>(bytes),
status.ToString().c_str());
}
};
// Print contents of a log file. (*func)() is called on every record.
bool PrintLogContents(Env* env, const std::string& fname,
void (*func)(Slice)) {
SequentialFile* file;
Status s = env->NewSequentialFile(fname, &file);
if (!s.ok()) {
fprintf(stderr, "%s\n", s.ToString().c_str());
return false;
}
CorruptionReporter reporter;
log::Reader reader(file, &reporter, true, 0);
Slice record;
std::string scratch;
while (reader.ReadRecord(&record, &scratch)) {
printf("--- offset %llu; ",
static_cast<unsigned long long>(reader.LastRecordOffset()));
(*func)(record);
}
delete file;
return true;
}
// Called on every item found in a WriteBatch.
class WriteBatchItemPrinter : public WriteBatch::Handler {
public:
uint64_t offset_;
uint64_t sequence_;
virtual void Put(const Slice& key, const Slice& value) {
printf(" put '%s' '%s'\n",
EscapeString(key).c_str(),
EscapeString(value).c_str());
}
virtual void Delete(const Slice& key) {
printf(" del '%s'\n",
EscapeString(key).c_str());
}
};
// Called on every log record (each one of which is a WriteBatch)
// found in a kLogFile.
static void WriteBatchPrinter(Slice record) {
if (record.size() < 12) {
printf("log record length %d is too small\n",
static_cast<int>(record.size()));
return;
}
WriteBatch batch;
WriteBatchInternal::SetContents(&batch, record);
printf("sequence %llu\n",
static_cast<unsigned long long>(WriteBatchInternal::Sequence(&batch)));
WriteBatchItemPrinter batch_item_printer;
Status s = batch.Iterate(&batch_item_printer);
if (!s.ok()) {
printf(" error: %s\n", s.ToString().c_str());
}
}
bool DumpLog(Env* env, const std::string& fname) {
return PrintLogContents(env, fname, WriteBatchPrinter);
}
// Called on every log record (each one of which is a WriteBatch)
// found in a kDescriptorFile.
static void VersionEditPrinter(Slice record) {
VersionEdit edit;
Status s = edit.DecodeFrom(record);
if (!s.ok()) {
printf("%s\n", s.ToString().c_str());
return;
}
printf("%s", edit.DebugString().c_str());
}
bool DumpDescriptor(Env* env, const std::string& fname) {
return PrintLogContents(env, fname, VersionEditPrinter);
}
bool DumpTable(Env* env, const std::string& fname) {
uint64_t file_size;
RandomAccessFile* file = NULL;
Table* table = NULL;
Status s = env->GetFileSize(fname, &file_size);
if (s.ok()) {
s = env->NewRandomAccessFile(fname, &file);
}
if (s.ok()) {
// We use the default comparator, which may or may not match the
// comparator used in this database. However this should not cause
// problems since we only use Table operations that do not require
// any comparisons. In particular, we do not call Seek or Prev.
s = Table::Open(Options(), file, file_size, &table);
}
if (!s.ok()) {
fprintf(stderr, "%s\n", s.ToString().c_str());
delete table;
delete file;
return false;
}
ReadOptions ro;
ro.fill_cache = false;
Iterator* iter = table->NewIterator(ro);
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ParsedInternalKey key;
if (!ParseInternalKey(iter->key(), &key)) {
printf("badkey '%s' => '%s'\n",
EscapeString(iter->key()).c_str(),
EscapeString(iter->value()).c_str());
} else {
char kbuf[20];
const char* type;
if (key.type == kTypeDeletion) {
type = "del";
} else if (key.type == kTypeValue) {
type = "val";
} else {
snprintf(kbuf, sizeof(kbuf), "%d", static_cast<int>(key.type));
type = kbuf;
}
printf("'%s' @ %8llu : %s => '%s'\n",
EscapeString(key.user_key).c_str(),
static_cast<unsigned long long>(key.sequence),
type,
EscapeString(iter->value()).c_str());
}
}
s = iter->status();
if (!s.ok()) {
printf("iterator error: %s\n", s.ToString().c_str());
}
delete iter;
delete table;
delete file;
return true;
}
bool DumpFile(Env* env, const std::string& fname) {
FileType ftype;
if (!GuessType(fname, &ftype)) {
fprintf(stderr, "%s: unknown file type\n", fname.c_str());
return false;
}
switch (ftype) {
case kLogFile: return DumpLog(env, fname);
case kDescriptorFile: return DumpDescriptor(env, fname);
case kTableFile: return DumpTable(env, fname);
default: {
fprintf(stderr, "%s: not a dump-able file type\n", fname.c_str());
break;
}
}
return false;
}
bool HandleDumpCommand(Env* env, char** files, int num) {
bool ok = true;
for (int i = 0; i < num; i++) {
ok &= DumpFile(env, files[i]);
}
return ok;
}
}
} // namespace leveldb
static void Usage() {
fprintf(
stderr,
"Usage: leveldbutil command...\n"
" dump files... -- dump contents of specified files\n"
);
}
int main(int argc, char** argv) {
leveldb::Env* env = leveldb::Env::Default();
bool ok = true;
if (argc < 2) {
Usage();
ok = false;
} else {
std::string command = argv[1];
if (command == "dump") {
ok = leveldb::HandleDumpCommand(env, argv+2, argc-2);
} else {
Usage();
ok = false;
}
}
return (ok ? 0 : 1);
}

View File

@@ -0,0 +1,35 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Log format information shared by reader and writer.
// See ../doc/log_format.txt for more detail.
#ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_
#define STORAGE_LEVELDB_DB_LOG_FORMAT_H_
namespace leveldb {
namespace log {
enum RecordType {
// Zero is reserved for preallocated files
kZeroType = 0,
kFullType = 1,
// For fragments
kFirstType = 2,
kMiddleType = 3,
kLastType = 4
};
static const int kMaxRecordType = kLastType;
static const int kBlockSize = 32768;
// Header is checksum (4 bytes), type (1 byte), length (2 bytes).
static const int kHeaderSize = 4 + 1 + 2;
} // namespace log
} // namespace leveldb
#endif // STORAGE_LEVELDB_DB_LOG_FORMAT_H_

View File

@@ -0,0 +1,259 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/log_reader.h"
#include <stdio.h>
#include "leveldb/env.h"
#include "util/coding.h"
#include "util/crc32c.h"
namespace leveldb {
namespace log {
Reader::Reporter::~Reporter() {
}
Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum,
uint64_t initial_offset)
: file_(file),
reporter_(reporter),
checksum_(checksum),
backing_store_(new char[kBlockSize]),
buffer_(),
eof_(false),
last_record_offset_(0),
end_of_buffer_offset_(0),
initial_offset_(initial_offset) {
}
Reader::~Reader() {
delete[] backing_store_;
}
bool Reader::SkipToInitialBlock() {
size_t offset_in_block = initial_offset_ % kBlockSize;
uint64_t block_start_location = initial_offset_ - offset_in_block;
// Don't search a block if we'd be in the trailer
if (offset_in_block > kBlockSize - 6) {
offset_in_block = 0;
block_start_location += kBlockSize;
}
end_of_buffer_offset_ = block_start_location;
// Skip to start of first block that can contain the initial record
if (block_start_location > 0) {
Status skip_status = file_->Skip(block_start_location);
if (!skip_status.ok()) {
ReportDrop(block_start_location, skip_status);
return false;
}
}
return true;
}
bool Reader::ReadRecord(Slice* record, std::string* scratch) {
if (last_record_offset_ < initial_offset_) {
if (!SkipToInitialBlock()) {
return false;
}
}
scratch->clear();
record->clear();
bool in_fragmented_record = false;
// Record offset of the logical record that we're reading
// 0 is a dummy value to make compilers happy
uint64_t prospective_record_offset = 0;
Slice fragment;
while (true) {
uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
const unsigned int record_type = ReadPhysicalRecord(&fragment);
switch (record_type) {
case kFullType:
if (in_fragmented_record) {
// Handle bug in earlier versions of log::Writer where
// it could emit an empty kFirstType record at the tail end
// of a block followed by a kFullType or kFirstType record
// at the beginning of the next block.
if (scratch->empty()) {
in_fragmented_record = false;
} else {
ReportCorruption(scratch->size(), "partial record without end(1)");
}
}
prospective_record_offset = physical_record_offset;
scratch->clear();
*record = fragment;
last_record_offset_ = prospective_record_offset;
return true;
case kFirstType:
if (in_fragmented_record) {
// Handle bug in earlier versions of log::Writer where
// it could emit an empty kFirstType record at the tail end
// of a block followed by a kFullType or kFirstType record
// at the beginning of the next block.
if (scratch->empty()) {
in_fragmented_record = false;
} else {
ReportCorruption(scratch->size(), "partial record without end(2)");
}
}
prospective_record_offset = physical_record_offset;
scratch->assign(fragment.data(), fragment.size());
in_fragmented_record = true;
break;
case kMiddleType:
if (!in_fragmented_record) {
ReportCorruption(fragment.size(),
"missing start of fragmented record(1)");
} else {
scratch->append(fragment.data(), fragment.size());
}
break;
case kLastType:
if (!in_fragmented_record) {
ReportCorruption(fragment.size(),
"missing start of fragmented record(2)");
} else {
scratch->append(fragment.data(), fragment.size());
*record = Slice(*scratch);
last_record_offset_ = prospective_record_offset;
return true;
}
break;
case kEof:
if (in_fragmented_record) {
ReportCorruption(scratch->size(), "partial record without end(3)");
scratch->clear();
}
return false;
case kBadRecord:
if (in_fragmented_record) {
ReportCorruption(scratch->size(), "error in middle of record");
in_fragmented_record = false;
scratch->clear();
}
break;
default: {
char buf[40];
snprintf(buf, sizeof(buf), "unknown record type %u", record_type);
ReportCorruption(
(fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
buf);
in_fragmented_record = false;
scratch->clear();
break;
}
}
}
return false;
}
uint64_t Reader::LastRecordOffset() {
return last_record_offset_;
}
void Reader::ReportCorruption(size_t bytes, const char* reason) {
ReportDrop(bytes, Status::Corruption(reason));
}
void Reader::ReportDrop(size_t bytes, const Status& reason) {
if (reporter_ != NULL &&
end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {
reporter_->Corruption(bytes, reason);
}
}
unsigned int Reader::ReadPhysicalRecord(Slice* result) {
while (true) {
if (buffer_.size() < kHeaderSize) {
if (!eof_) {
// Last read was a full read, so this is a trailer to skip
buffer_.clear();
Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
end_of_buffer_offset_ += buffer_.size();
if (!status.ok()) {
buffer_.clear();
ReportDrop(kBlockSize, status);
eof_ = true;
return kEof;
} else if (buffer_.size() < kBlockSize) {
eof_ = true;
}
continue;
} else if (buffer_.size() == 0) {
// End of file
return kEof;
} else {
size_t drop_size = buffer_.size();
buffer_.clear();
ReportCorruption(drop_size, "truncated record at end of file");
return kEof;
}
}
// Parse the header
const char* header = buffer_.data();
const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
const unsigned int type = header[6];
const uint32_t length = a | (b << 8);
if (kHeaderSize + length > buffer_.size()) {
size_t drop_size = buffer_.size();
buffer_.clear();
ReportCorruption(drop_size, "bad record length");
return kBadRecord;
}
if (type == kZeroType && length == 0) {
// Skip zero length record without reporting any drops since
// such records are produced by the mmap based writing code in
// env_posix.cc that preallocates file regions.
buffer_.clear();
return kBadRecord;
}
// Check crc
if (checksum_) {
uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
if (actual_crc != expected_crc) {
// Drop the rest of the buffer since "length" itself may have
// been corrupted and if we trust it, we could find some
// fragment of a real log record that just happens to look
// like a valid log record.
size_t drop_size = buffer_.size();
buffer_.clear();
ReportCorruption(drop_size, "checksum mismatch");
return kBadRecord;
}
}
buffer_.remove_prefix(kHeaderSize + length);
// Skip physical record that started before initial_offset_
if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length <
initial_offset_) {
result->clear();
return kBadRecord;
}
*result = Slice(header + kHeaderSize, length);
return type;
}
}
} // namespace log
} // namespace leveldb

108
src/leveldb/db/log_reader.h Normal file
View File

@@ -0,0 +1,108 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_LOG_READER_H_
#define STORAGE_LEVELDB_DB_LOG_READER_H_
#include <stdint.h>
#include "db/log_format.h"
#include "leveldb/slice.h"
#include "leveldb/status.h"
namespace leveldb {
class SequentialFile;
namespace log {
class Reader {
public:
// Interface for reporting errors.
class Reporter {
public:
virtual ~Reporter();
// Some corruption was detected. "size" is the approximate number
// of bytes dropped due to the corruption.
virtual void Corruption(size_t bytes, const Status& status) = 0;
};
// Create a reader that will return log records from "*file".
// "*file" must remain live while this Reader is in use.
//
// If "reporter" is non-NULL, it is notified whenever some data is
// dropped due to a detected corruption. "*reporter" must remain
// live while this Reader is in use.
//
// If "checksum" is true, verify checksums if available.
//
// The Reader will start reading at the first record located at physical
// position >= initial_offset within the file.
Reader(SequentialFile* file, Reporter* reporter, bool checksum,
uint64_t initial_offset);
~Reader();
// Read the next record into *record. Returns true if read
// successfully, false if we hit end of the input. May use
// "*scratch" as temporary storage. The contents filled in *record
// will only be valid until the next mutating operation on this
// reader or the next mutation to *scratch.
bool ReadRecord(Slice* record, std::string* scratch);
// Returns the physical offset of the last record returned by ReadRecord.
//
// Undefined before the first call to ReadRecord.
uint64_t LastRecordOffset();
private:
SequentialFile* const file_;
Reporter* const reporter_;
bool const checksum_;
char* const backing_store_;
Slice buffer_;
bool eof_; // Last Read() indicated EOF by returning < kBlockSize
// Offset of the last record returned by ReadRecord.
uint64_t last_record_offset_;
// Offset of the first location past the end of buffer_.
uint64_t end_of_buffer_offset_;
// Offset at which to start looking for the first record to return
uint64_t const initial_offset_;
// Extend record types with the following special values
enum {
kEof = kMaxRecordType + 1,
// Returned whenever we find an invalid physical record.
// Currently there are three situations in which this happens:
// * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
// * The record is a 0-length record (No drop is reported)
// * The record is below constructor's initial_offset (No drop is reported)
kBadRecord = kMaxRecordType + 2
};
// Skips all blocks that are completely before "initial_offset_".
//
// Returns true on success. Handles reporting.
bool SkipToInitialBlock();
// Return type, or one of the preceding special values
unsigned int ReadPhysicalRecord(Slice* result);
// Reports dropped bytes to the reporter.
// buffer_ must be updated to remove the dropped bytes prior to invocation.
void ReportCorruption(size_t bytes, const char* reason);
void ReportDrop(size_t bytes, const Status& reason);
// No copying allowed
Reader(const Reader&);
void operator=(const Reader&);
};
} // namespace log
} // namespace leveldb
#endif // STORAGE_LEVELDB_DB_LOG_READER_H_

500
src/leveldb/db/log_test.cc Normal file
View File

@@ -0,0 +1,500 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "leveldb/env.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/random.h"
#include "util/testharness.h"
namespace leveldb {
namespace log {
// Construct a string of the specified length made out of the supplied
// partial string.
static std::string BigString(const std::string& partial_string, size_t n) {
std::string result;
while (result.size() < n) {
result.append(partial_string);
}
result.resize(n);
return result;
}
// Construct a string from a number
static std::string NumberString(int n) {
char buf[50];
snprintf(buf, sizeof(buf), "%d.", n);
return std::string(buf);
}
// Return a skewed potentially long string
static std::string RandomSkewedString(int i, Random* rnd) {
return BigString(NumberString(i), rnd->Skewed(17));
}
class LogTest {
private:
class StringDest : public WritableFile {
public:
std::string contents_;
virtual Status Close() { return Status::OK(); }
virtual Status Flush() { return Status::OK(); }
virtual Status Sync() { return Status::OK(); }
virtual Status Append(const Slice& slice) {
contents_.append(slice.data(), slice.size());
return Status::OK();
}
};
class StringSource : public SequentialFile {
public:
Slice contents_;
bool force_error_;
bool returned_partial_;
StringSource() : force_error_(false), returned_partial_(false) { }
virtual Status Read(size_t n, Slice* result, char* scratch) {
ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error";
if (force_error_) {
force_error_ = false;
returned_partial_ = true;
return Status::Corruption("read error");
}
if (contents_.size() < n) {
n = contents_.size();
returned_partial_ = true;
}
*result = Slice(contents_.data(), n);
contents_.remove_prefix(n);
return Status::OK();
}
virtual Status Skip(uint64_t n) {
if (n > contents_.size()) {
contents_.clear();
return Status::NotFound("in-memory file skipepd past end");
}
contents_.remove_prefix(n);
return Status::OK();
}
};
class ReportCollector : public Reader::Reporter {
public:
size_t dropped_bytes_;
std::string message_;
ReportCollector() : dropped_bytes_(0) { }
virtual void Corruption(size_t bytes, const Status& status) {
dropped_bytes_ += bytes;
message_.append(status.ToString());
}
};
StringDest dest_;
StringSource source_;
ReportCollector report_;
bool reading_;
Writer writer_;
Reader reader_;
// Record metadata for testing initial offset functionality
static size_t initial_offset_record_sizes_[];
static uint64_t initial_offset_last_record_offsets_[];
public:
LogTest() : reading_(false),
writer_(&dest_),
reader_(&source_, &report_, true/*checksum*/,
0/*initial_offset*/) {
}
void Write(const std::string& msg) {
ASSERT_TRUE(!reading_) << "Write() after starting to read";
writer_.AddRecord(Slice(msg));
}
size_t WrittenBytes() const {
return dest_.contents_.size();
}
std::string Read() {
if (!reading_) {
reading_ = true;
source_.contents_ = Slice(dest_.contents_);
}
std::string scratch;
Slice record;
if (reader_.ReadRecord(&record, &scratch)) {
return record.ToString();
} else {
return "EOF";
}
}
void IncrementByte(int offset, int delta) {
dest_.contents_[offset] += delta;
}
void SetByte(int offset, char new_byte) {
dest_.contents_[offset] = new_byte;
}
void ShrinkSize(int bytes) {
dest_.contents_.resize(dest_.contents_.size() - bytes);
}
void FixChecksum(int header_offset, int len) {
// Compute crc of type/len/data
uint32_t crc = crc32c::Value(&dest_.contents_[header_offset+6], 1 + len);
crc = crc32c::Mask(crc);
EncodeFixed32(&dest_.contents_[header_offset], crc);
}
void ForceError() {
source_.force_error_ = true;
}
size_t DroppedBytes() const {
return report_.dropped_bytes_;
}
std::string ReportMessage() const {
return report_.message_;
}
// Returns OK iff recorded error message contains "msg"
std::string MatchError(const std::string& msg) const {
if (report_.message_.find(msg) == std::string::npos) {
return report_.message_;
} else {
return "OK";
}
}
void WriteInitialOffsetLog() {
for (int i = 0; i < 4; i++) {
std::string record(initial_offset_record_sizes_[i],
static_cast<char>('a' + i));
Write(record);
}
}
void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) {
WriteInitialOffsetLog();
reading_ = true;
source_.contents_ = Slice(dest_.contents_);
Reader* offset_reader = new Reader(&source_, &report_, true/*checksum*/,
WrittenBytes() + offset_past_end);
Slice record;
std::string scratch;
ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch));
delete offset_reader;
}
void CheckInitialOffsetRecord(uint64_t initial_offset,
int expected_record_offset) {
WriteInitialOffsetLog();
reading_ = true;
source_.contents_ = Slice(dest_.contents_);
Reader* offset_reader = new Reader(&source_, &report_, true/*checksum*/,
initial_offset);
Slice record;
std::string scratch;
ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));
ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset],
record.size());
ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset],
offset_reader->LastRecordOffset());
ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]);
delete offset_reader;
}
};
size_t LogTest::initial_offset_record_sizes_[] =
{10000, // Two sizable records in first block
10000,
2 * log::kBlockSize - 1000, // Span three blocks
1};
uint64_t LogTest::initial_offset_last_record_offsets_[] =
{0,
kHeaderSize + 10000,
2 * (kHeaderSize + 10000),
2 * (kHeaderSize + 10000) +
(2 * log::kBlockSize - 1000) + 3 * kHeaderSize};
TEST(LogTest, Empty) {
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, ReadWrite) {
Write("foo");
Write("bar");
Write("");
Write("xxxx");
ASSERT_EQ("foo", Read());
ASSERT_EQ("bar", Read());
ASSERT_EQ("", Read());
ASSERT_EQ("xxxx", Read());
ASSERT_EQ("EOF", Read());
ASSERT_EQ("EOF", Read()); // Make sure reads at eof work
}
TEST(LogTest, ManyBlocks) {
for (int i = 0; i < 100000; i++) {
Write(NumberString(i));
}
for (int i = 0; i < 100000; i++) {
ASSERT_EQ(NumberString(i), Read());
}
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, Fragmentation) {
Write("small");
Write(BigString("medium", 50000));
Write(BigString("large", 100000));
ASSERT_EQ("small", Read());
ASSERT_EQ(BigString("medium", 50000), Read());
ASSERT_EQ(BigString("large", 100000), Read());
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, MarginalTrailer) {
// Make a trailer that is exactly the same length as an empty record.
const int n = kBlockSize - 2*kHeaderSize;
Write(BigString("foo", n));
ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes());
Write("");
Write("bar");
ASSERT_EQ(BigString("foo", n), Read());
ASSERT_EQ("", Read());
ASSERT_EQ("bar", Read());
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, MarginalTrailer2) {
// Make a trailer that is exactly the same length as an empty record.
const int n = kBlockSize - 2*kHeaderSize;
Write(BigString("foo", n));
ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes());
Write("bar");
ASSERT_EQ(BigString("foo", n), Read());
ASSERT_EQ("bar", Read());
ASSERT_EQ("EOF", Read());
ASSERT_EQ(0, DroppedBytes());
ASSERT_EQ("", ReportMessage());
}
TEST(LogTest, ShortTrailer) {
const int n = kBlockSize - 2*kHeaderSize + 4;
Write(BigString("foo", n));
ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes());
Write("");
Write("bar");
ASSERT_EQ(BigString("foo", n), Read());
ASSERT_EQ("", Read());
ASSERT_EQ("bar", Read());
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, AlignedEof) {
const int n = kBlockSize - 2*kHeaderSize + 4;
Write(BigString("foo", n));
ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes());
ASSERT_EQ(BigString("foo", n), Read());
ASSERT_EQ("EOF", Read());
}
TEST(LogTest, RandomRead) {
const int N = 500;
Random write_rnd(301);
for (int i = 0; i < N; i++) {
Write(RandomSkewedString(i, &write_rnd));
}
Random read_rnd(301);
for (int i = 0; i < N; i++) {
ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read());
}
ASSERT_EQ("EOF", Read());
}
// Tests of all the error paths in log_reader.cc follow:
TEST(LogTest, ReadError) {
Write("foo");
ForceError();
ASSERT_EQ("EOF", Read());
ASSERT_EQ(kBlockSize, DroppedBytes());
ASSERT_EQ("OK", MatchError("read error"));
}
TEST(LogTest, BadRecordType) {
Write("foo");
// Type is stored in header[6]
IncrementByte(6, 100);
FixChecksum(0, 3);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3, DroppedBytes());
ASSERT_EQ("OK", MatchError("unknown record type"));
}
TEST(LogTest, TruncatedTrailingRecord) {
Write("foo");
ShrinkSize(4); // Drop all payload as well as a header byte
ASSERT_EQ("EOF", Read());
ASSERT_EQ(kHeaderSize - 1, DroppedBytes());
ASSERT_EQ("OK", MatchError("truncated record at end of file"));
}
TEST(LogTest, BadLength) {
Write("foo");
ShrinkSize(1);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(kHeaderSize + 2, DroppedBytes());
ASSERT_EQ("OK", MatchError("bad record length"));
}
TEST(LogTest, ChecksumMismatch) {
Write("foo");
IncrementByte(0, 10);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(10, DroppedBytes());
ASSERT_EQ("OK", MatchError("checksum mismatch"));
}
TEST(LogTest, UnexpectedMiddleType) {
Write("foo");
SetByte(6, kMiddleType);
FixChecksum(0, 3);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3, DroppedBytes());
ASSERT_EQ("OK", MatchError("missing start"));
}
TEST(LogTest, UnexpectedLastType) {
Write("foo");
SetByte(6, kLastType);
FixChecksum(0, 3);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3, DroppedBytes());
ASSERT_EQ("OK", MatchError("missing start"));
}
TEST(LogTest, UnexpectedFullType) {
Write("foo");
Write("bar");
SetByte(6, kFirstType);
FixChecksum(0, 3);
ASSERT_EQ("bar", Read());
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3, DroppedBytes());
ASSERT_EQ("OK", MatchError("partial record without end"));
}
TEST(LogTest, UnexpectedFirstType) {
Write("foo");
Write(BigString("bar", 100000));
SetByte(6, kFirstType);
FixChecksum(0, 3);
ASSERT_EQ(BigString("bar", 100000), Read());
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3, DroppedBytes());
ASSERT_EQ("OK", MatchError("partial record without end"));
}
TEST(LogTest, ErrorJoinsRecords) {
// Consider two fragmented records:
// first(R1) last(R1) first(R2) last(R2)
// where the middle two fragments disappear. We do not want
// first(R1),last(R2) to get joined and returned as a valid record.
// Write records that span two blocks
Write(BigString("foo", kBlockSize));
Write(BigString("bar", kBlockSize));
Write("correct");
// Wipe the middle block
for (int offset = kBlockSize; offset < 2*kBlockSize; offset++) {
SetByte(offset, 'x');
}
ASSERT_EQ("correct", Read());
ASSERT_EQ("EOF", Read());
const int dropped = DroppedBytes();
ASSERT_LE(dropped, 2*kBlockSize + 100);
ASSERT_GE(dropped, 2*kBlockSize);
}
TEST(LogTest, ReadStart) {
CheckInitialOffsetRecord(0, 0);
}
TEST(LogTest, ReadSecondOneOff) {
CheckInitialOffsetRecord(1, 1);
}
TEST(LogTest, ReadSecondTenThousand) {
CheckInitialOffsetRecord(10000, 1);
}
TEST(LogTest, ReadSecondStart) {
CheckInitialOffsetRecord(10007, 1);
}
TEST(LogTest, ReadThirdOneOff) {
CheckInitialOffsetRecord(10008, 2);
}
TEST(LogTest, ReadThirdStart) {
CheckInitialOffsetRecord(20014, 2);
}
TEST(LogTest, ReadFourthOneOff) {
CheckInitialOffsetRecord(20015, 3);
}
TEST(LogTest, ReadFourthFirstBlockTrailer) {
CheckInitialOffsetRecord(log::kBlockSize - 4, 3);
}
TEST(LogTest, ReadFourthMiddleBlock) {
CheckInitialOffsetRecord(log::kBlockSize + 1, 3);
}
TEST(LogTest, ReadFourthLastBlock) {
CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3);
}
TEST(LogTest, ReadFourthStart) {
CheckInitialOffsetRecord(
2 * (kHeaderSize + 1000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize,
3);
}
TEST(LogTest, ReadEnd) {
CheckOffsetPastEndReturnsNoRecords(0);
}
TEST(LogTest, ReadPastEnd) {
CheckOffsetPastEndReturnsNoRecords(5);
}
} // namespace log
} // namespace leveldb
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

View File

@@ -0,0 +1,103 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/log_writer.h"
#include <stdint.h>
#include "leveldb/env.h"
#include "util/coding.h"
#include "util/crc32c.h"
namespace leveldb {
namespace log {
Writer::Writer(WritableFile* dest)
: dest_(dest),
block_offset_(0) {
for (int i = 0; i <= kMaxRecordType; i++) {
char t = static_cast<char>(i);
type_crc_[i] = crc32c::Value(&t, 1);
}
}
Writer::~Writer() {
}
Status Writer::AddRecord(const Slice& slice) {
const char* ptr = slice.data();
size_t left = slice.size();
// Fragment the record if necessary and emit it. Note that if slice
// is empty, we still want to iterate once to emit a single
// zero-length record
Status s;
bool begin = true;
do {
const int leftover = kBlockSize - block_offset_;
assert(leftover >= 0);
if (leftover < kHeaderSize) {
// Switch to a new block
if (leftover > 0) {
// Fill the trailer (literal below relies on kHeaderSize being 7)
assert(kHeaderSize == 7);
dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover));
}
block_offset_ = 0;
}
// Invariant: we never leave < kHeaderSize bytes in a block.
assert(kBlockSize - block_offset_ - kHeaderSize >= 0);
const size_t avail = kBlockSize - block_offset_ - kHeaderSize;
const size_t fragment_length = (left < avail) ? left : avail;
RecordType type;
const bool end = (left == fragment_length);
if (begin && end) {
type = kFullType;
} else if (begin) {
type = kFirstType;
} else if (end) {
type = kLastType;
} else {
type = kMiddleType;
}
s = EmitPhysicalRecord(type, ptr, fragment_length);
ptr += fragment_length;
left -= fragment_length;
begin = false;
} while (s.ok() && left > 0);
return s;
}
Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
assert(n <= 0xffff); // Must fit in two bytes
assert(block_offset_ + kHeaderSize + n <= kBlockSize);
// Format the header
char buf[kHeaderSize];
buf[4] = static_cast<char>(n & 0xff);
buf[5] = static_cast<char>(n >> 8);
buf[6] = static_cast<char>(t);
// Compute the crc of the record type and the payload.
uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n);
crc = crc32c::Mask(crc); // Adjust for storage
EncodeFixed32(buf, crc);
// Write the header and the payload
Status s = dest_->Append(Slice(buf, kHeaderSize));
if (s.ok()) {
s = dest_->Append(Slice(ptr, n));
if (s.ok()) {
s = dest_->Flush();
}
}
block_offset_ += kHeaderSize + n;
return s;
}
} // namespace log
} // namespace leveldb

View File

@@ -0,0 +1,48 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_LOG_WRITER_H_
#define STORAGE_LEVELDB_DB_LOG_WRITER_H_
#include <stdint.h>
#include "db/log_format.h"
#include "leveldb/slice.h"
#include "leveldb/status.h"
namespace leveldb {
class WritableFile;
namespace log {
class Writer {
public:
// Create a writer that will append data to "*dest".
// "*dest" must be initially empty.
// "*dest" must remain live while this Writer is in use.
explicit Writer(WritableFile* dest);
~Writer();
Status AddRecord(const Slice& slice);
private:
WritableFile* dest_;
int block_offset_; // Current offset in block
// crc32c values for all supported record types. These are
// pre-computed to reduce the overhead of computing the crc of the
// record type stored in the header.
uint32_t type_crc_[kMaxRecordType + 1];
Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);
// No copying allowed
Writer(const Writer&);
void operator=(const Writer&);
};
} // namespace log
} // namespace leveldb
#endif // STORAGE_LEVELDB_DB_LOG_WRITER_H_

145
src/leveldb/db/memtable.cc Normal file
View File

@@ -0,0 +1,145 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/memtable.h"
#include "db/dbformat.h"
#include "leveldb/comparator.h"
#include "leveldb/env.h"
#include "leveldb/iterator.h"
#include "util/coding.h"
namespace leveldb {
static Slice GetLengthPrefixedSlice(const char* data) {
uint32_t len;
const char* p = data;
p = GetVarint32Ptr(p, p + 5, &len); // +5: we assume "p" is not corrupted
return Slice(p, len);
}
MemTable::MemTable(const InternalKeyComparator& cmp)
: comparator_(cmp),
refs_(0),
table_(comparator_, &arena_) {
}
MemTable::~MemTable() {
assert(refs_ == 0);
}
size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); }
int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr)
const {
// Internal keys are encoded as length-prefixed strings.
Slice a = GetLengthPrefixedSlice(aptr);
Slice b = GetLengthPrefixedSlice(bptr);
return comparator.Compare(a, b);
}
// Encode a suitable internal key target for "target" and return it.
// Uses *scratch as scratch space, and the returned pointer will point
// into this scratch space.
static const char* EncodeKey(std::string* scratch, const Slice& target) {
scratch->clear();
PutVarint32(scratch, target.size());
scratch->append(target.data(), target.size());
return scratch->data();
}
class MemTableIterator: public Iterator {
public:
explicit MemTableIterator(MemTable::Table* table) : iter_(table) { }
virtual bool Valid() const { return iter_.Valid(); }
virtual void Seek(const Slice& k) { iter_.Seek(EncodeKey(&tmp_, k)); }
virtual void SeekToFirst() { iter_.SeekToFirst(); }
virtual void SeekToLast() { iter_.SeekToLast(); }
virtual void Next() { iter_.Next(); }
virtual void Prev() { iter_.Prev(); }
virtual Slice key() const { return GetLengthPrefixedSlice(iter_.key()); }
virtual Slice value() const {
Slice key_slice = GetLengthPrefixedSlice(iter_.key());
return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
}
virtual Status status() const { return Status::OK(); }
private:
MemTable::Table::Iterator iter_;
std::string tmp_; // For passing to EncodeKey
// No copying allowed
MemTableIterator(const MemTableIterator&);
void operator=(const MemTableIterator&);
};
Iterator* MemTable::NewIterator() {
return new MemTableIterator(&table_);
}
void MemTable::Add(SequenceNumber s, ValueType type,
const Slice& key,
const Slice& value) {
// Format of an entry is concatenation of:
// key_size : varint32 of internal_key.size()
// key bytes : char[internal_key.size()]
// value_size : varint32 of value.size()
// value bytes : char[value.size()]
size_t key_size = key.size();
size_t val_size = value.size();
size_t internal_key_size = key_size + 8;
const size_t encoded_len =
VarintLength(internal_key_size) + internal_key_size +
VarintLength(val_size) + val_size;
char* buf = arena_.Allocate(encoded_len);
char* p = EncodeVarint32(buf, internal_key_size);
memcpy(p, key.data(), key_size);
p += key_size;
EncodeFixed64(p, (s << 8) | type);
p += 8;
p = EncodeVarint32(p, val_size);
memcpy(p, value.data(), val_size);
assert((p + val_size) - buf == encoded_len);
table_.Insert(buf);
}
bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) {
Slice memkey = key.memtable_key();
Table::Iterator iter(&table_);
iter.Seek(memkey.data());
if (iter.Valid()) {
// entry format is:
// klength varint32
// userkey char[klength]
// tag uint64
// vlength varint32
// value char[vlength]
// Check that it belongs to same user key. We do not check the
// sequence number since the Seek() call above should have skipped
// all entries with overly large sequence numbers.
const char* entry = iter.key();
uint32_t key_length;
const char* key_ptr = GetVarint32Ptr(entry, entry+5, &key_length);
if (comparator_.comparator.user_comparator()->Compare(
Slice(key_ptr, key_length - 8),
key.user_key()) == 0) {
// Correct user key
const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
switch (static_cast<ValueType>(tag & 0xff)) {
case kTypeValue: {
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
value->assign(v.data(), v.size());
return true;
}
case kTypeDeletion:
*s = Status::NotFound(Slice());
return true;
}
}
}
return false;
}
} // namespace leveldb

91
src/leveldb/db/memtable.h Normal file
View File

@@ -0,0 +1,91 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_MEMTABLE_H_
#define STORAGE_LEVELDB_DB_MEMTABLE_H_
#include <string>
#include "leveldb/db.h"
#include "db/dbformat.h"
#include "db/skiplist.h"
#include "util/arena.h"
namespace leveldb {
class InternalKeyComparator;
class Mutex;
class MemTableIterator;
class MemTable {
public:
// MemTables are reference counted. The initial reference count
// is zero and the caller must call Ref() at least once.
explicit MemTable(const InternalKeyComparator& comparator);
// Increase reference count.
void Ref() { ++refs_; }
// Drop reference count. Delete if no more references exist.
void Unref() {
--refs_;
assert(refs_ >= 0);
if (refs_ <= 0) {
delete this;
}
}
// Returns an estimate of the number of bytes of data in use by this
// data structure.
//
// REQUIRES: external synchronization to prevent simultaneous
// operations on the same MemTable.
size_t ApproximateMemoryUsage();
// Return an iterator that yields the contents of the memtable.
//
// The caller must ensure that the underlying MemTable remains live
// while the returned iterator is live. The keys returned by this
// iterator are internal keys encoded by AppendInternalKey in the
// db/format.{h,cc} module.
Iterator* NewIterator();
// Add an entry into memtable that maps key to value at the
// specified sequence number and with the specified type.
// Typically value will be empty if type==kTypeDeletion.
void Add(SequenceNumber seq, ValueType type,
const Slice& key,
const Slice& value);
// If memtable contains a value for key, store it in *value and return true.
// If memtable contains a deletion for key, store a NotFound() error
// in *status and return true.
// Else, return false.
bool Get(const LookupKey& key, std::string* value, Status* s);
private:
~MemTable(); // Private since only Unref() should be used to delete it
struct KeyComparator {
const InternalKeyComparator comparator;
explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
int operator()(const char* a, const char* b) const;
};
friend class MemTableIterator;
friend class MemTableBackwardIterator;
typedef SkipList<const char*, KeyComparator> Table;
KeyComparator comparator_;
int refs_;
Arena arena_;
Table table_;
// No copying allowed
MemTable(const MemTable&);
void operator=(const MemTable&);
};
} // namespace leveldb
#endif // STORAGE_LEVELDB_DB_MEMTABLE_H_

389
src/leveldb/db/repair.cc Normal file
View File

@@ -0,0 +1,389 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// We recover the contents of the descriptor from the other files we find.
// (1) Any log files are first converted to tables
// (2) We scan every table to compute
// (a) smallest/largest for the table
// (b) largest sequence number in the table
// (3) We generate descriptor contents:
// - log number is set to zero
// - next-file-number is set to 1 + largest file number we found
// - last-sequence-number is set to largest sequence# found across
// all tables (see 2c)
// - compaction pointers are cleared
// - every table file is added at level 0
//
// Possible optimization 1:
// (a) Compute total size and use to pick appropriate max-level M
// (b) Sort tables by largest sequence# in the table
// (c) For each table: if it overlaps earlier table, place in level-0,
// else place in level-M.
// Possible optimization 2:
// Store per-table metadata (smallest, largest, largest-seq#, ...)
// in the table's meta section to speed up ScanTable.
#include "db/builder.h"
#include "db/db_impl.h"
#include "db/dbformat.h"
#include "db/filename.h"
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "db/memtable.h"
#include "db/table_cache.h"
#include "db/version_edit.h"
#include "db/write_batch_internal.h"
#include "leveldb/comparator.h"
#include "leveldb/db.h"
#include "leveldb/env.h"
namespace leveldb {
namespace {
class Repairer {
public:
Repairer(const std::string& dbname, const Options& options)
: dbname_(dbname),
env_(options.env),
icmp_(options.comparator),
ipolicy_(options.filter_policy),
options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
owns_info_log_(options_.info_log != options.info_log),
owns_cache_(options_.block_cache != options.block_cache),
next_file_number_(1) {
// TableCache can be small since we expect each table to be opened once.
table_cache_ = new TableCache(dbname_, &options_, 10);
}
~Repairer() {
delete table_cache_;
if (owns_info_log_) {
delete options_.info_log;
}
if (owns_cache_) {
delete options_.block_cache;
}
}
Status Run() {
Status status = FindFiles();
if (status.ok()) {
ConvertLogFilesToTables();
ExtractMetaData();
status = WriteDescriptor();
}
if (status.ok()) {
unsigned long long bytes = 0;
for (size_t i = 0; i < tables_.size(); i++) {
bytes += tables_[i].meta.file_size;
}
Log(options_.info_log,
"**** Repaired leveldb %s; "
"recovered %d files; %llu bytes. "
"Some data may have been lost. "
"****",
dbname_.c_str(),
static_cast<int>(tables_.size()),
bytes);
}
return status;
}
private:
struct TableInfo {
FileMetaData meta;
SequenceNumber max_sequence;
};
std::string const dbname_;
Env* const env_;
InternalKeyComparator const icmp_;
InternalFilterPolicy const ipolicy_;
Options const options_;
bool owns_info_log_;
bool owns_cache_;
TableCache* table_cache_;
VersionEdit edit_;
std::vector<std::string> manifests_;
std::vector<uint64_t> table_numbers_;
std::vector<uint64_t> logs_;
std::vector<TableInfo> tables_;
uint64_t next_file_number_;
Status FindFiles() {
std::vector<std::string> filenames;
Status status = env_->GetChildren(dbname_, &filenames);
if (!status.ok()) {
return status;
}
if (filenames.empty()) {
return Status::IOError(dbname_, "repair found no files");
}
uint64_t number;
FileType type;
for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type)) {
if (type == kDescriptorFile) {
manifests_.push_back(filenames[i]);
} else {
if (number + 1 > next_file_number_) {
next_file_number_ = number + 1;
}
if (type == kLogFile) {
logs_.push_back(number);
} else if (type == kTableFile) {
table_numbers_.push_back(number);
} else {
// Ignore other files
}
}
}
}
return status;
}
void ConvertLogFilesToTables() {
for (size_t i = 0; i < logs_.size(); i++) {
std::string logname = LogFileName(dbname_, logs_[i]);
Status status = ConvertLogToTable(logs_[i]);
if (!status.ok()) {
Log(options_.info_log, "Log #%llu: ignoring conversion error: %s",
(unsigned long long) logs_[i],
status.ToString().c_str());
}
ArchiveFile(logname);
}
}
Status ConvertLogToTable(uint64_t log) {
struct LogReporter : public log::Reader::Reporter {
Env* env;
Logger* info_log;
uint64_t lognum;
virtual void Corruption(size_t bytes, const Status& s) {
// We print error messages for corruption, but continue repairing.
Log(info_log, "Log #%llu: dropping %d bytes; %s",
(unsigned long long) lognum,
static_cast<int>(bytes),
s.ToString().c_str());
}
};
// Open the log file
std::string logname = LogFileName(dbname_, log);
SequentialFile* lfile;
Status status = env_->NewSequentialFile(logname, &lfile);
if (!status.ok()) {
return status;
}
// Create the log reader.
LogReporter reporter;
reporter.env = env_;
reporter.info_log = options_.info_log;
reporter.lognum = log;
// We intentially make log::Reader do checksumming so that
// corruptions cause entire commits to be skipped instead of
// propagating bad information (like overly large sequence
// numbers).
log::Reader reader(lfile, &reporter, false/*do not checksum*/,
0/*initial_offset*/);
// Read all the records and add to a memtable
std::string scratch;
Slice record;
WriteBatch batch;
MemTable* mem = new MemTable(icmp_);
mem->Ref();
int counter = 0;
while (reader.ReadRecord(&record, &scratch)) {
if (record.size() < 12) {
reporter.Corruption(
record.size(), Status::Corruption("log record too small"));
continue;
}
WriteBatchInternal::SetContents(&batch, record);
status = WriteBatchInternal::InsertInto(&batch, mem);
if (status.ok()) {
counter += WriteBatchInternal::Count(&batch);
} else {
Log(options_.info_log, "Log #%llu: ignoring %s",
(unsigned long long) log,
status.ToString().c_str());
status = Status::OK(); // Keep going with rest of file
}
}
delete lfile;
// Do not record a version edit for this conversion to a Table
// since ExtractMetaData() will also generate edits.
FileMetaData meta;
meta.number = next_file_number_++;
Iterator* iter = mem->NewIterator();
status = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
delete iter;
mem->Unref();
mem = NULL;
if (status.ok()) {
if (meta.file_size > 0) {
table_numbers_.push_back(meta.number);
}
}
Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
(unsigned long long) log,
counter,
(unsigned long long) meta.number,
status.ToString().c_str());
return status;
}
void ExtractMetaData() {
std::vector<TableInfo> kept;
for (size_t i = 0; i < table_numbers_.size(); i++) {
TableInfo t;
t.meta.number = table_numbers_[i];
Status status = ScanTable(&t);
if (!status.ok()) {
std::string fname = TableFileName(dbname_, table_numbers_[i]);
Log(options_.info_log, "Table #%llu: ignoring %s",
(unsigned long long) table_numbers_[i],
status.ToString().c_str());
ArchiveFile(fname);
} else {
tables_.push_back(t);
}
}
}
Status ScanTable(TableInfo* t) {
std::string fname = TableFileName(dbname_, t->meta.number);
int counter = 0;
Status status = env_->GetFileSize(fname, &t->meta.file_size);
if (status.ok()) {
Iterator* iter = table_cache_->NewIterator(
ReadOptions(), t->meta.number, t->meta.file_size);
bool empty = true;
ParsedInternalKey parsed;
t->max_sequence = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
Slice key = iter->key();
if (!ParseInternalKey(key, &parsed)) {
Log(options_.info_log, "Table #%llu: unparsable key %s",
(unsigned long long) t->meta.number,
EscapeString(key).c_str());
continue;
}
counter++;
if (empty) {
empty = false;
t->meta.smallest.DecodeFrom(key);
}
t->meta.largest.DecodeFrom(key);
if (parsed.sequence > t->max_sequence) {
t->max_sequence = parsed.sequence;
}
}
if (!iter->status().ok()) {
status = iter->status();
}
delete iter;
}
Log(options_.info_log, "Table #%llu: %d entries %s",
(unsigned long long) t->meta.number,
counter,
status.ToString().c_str());
return status;
}
Status WriteDescriptor() {
std::string tmp = TempFileName(dbname_, 1);
WritableFile* file;
Status status = env_->NewWritableFile(tmp, &file);
if (!status.ok()) {
return status;
}
SequenceNumber max_sequence = 0;
for (size_t i = 0; i < tables_.size(); i++) {
if (max_sequence < tables_[i].max_sequence) {
max_sequence = tables_[i].max_sequence;
}
}
edit_.SetComparatorName(icmp_.user_comparator()->Name());
edit_.SetLogNumber(0);
edit_.SetNextFile(next_file_number_);
edit_.SetLastSequence(max_sequence);
for (size_t i = 0; i < tables_.size(); i++) {
// TODO(opt): separate out into multiple levels
const TableInfo& t = tables_[i];
edit_.AddFile(0, t.meta.number, t.meta.file_size,
t.meta.smallest, t.meta.largest);
}
//fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
{
log::Writer log(file);
std::string record;
edit_.EncodeTo(&record);
status = log.AddRecord(record);
}
if (status.ok()) {
status = file->Close();
}
delete file;
file = NULL;
if (!status.ok()) {
env_->DeleteFile(tmp);
} else {
// Discard older manifests
for (size_t i = 0; i < manifests_.size(); i++) {
ArchiveFile(dbname_ + "/" + manifests_[i]);
}
// Install new manifest
status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1));
if (status.ok()) {
status = SetCurrentFile(env_, dbname_, 1);
} else {
env_->DeleteFile(tmp);
}
}
return status;
}
void ArchiveFile(const std::string& fname) {
// Move into another directory. E.g., for
// dir/foo
// rename to
// dir/lost/foo
const char* slash = strrchr(fname.c_str(), '/');
std::string new_dir;
if (slash != NULL) {
new_dir.assign(fname.data(), slash - fname.data());
}
new_dir.append("/lost");
env_->CreateDir(new_dir); // Ignore error
std::string new_file = new_dir;
new_file.append("/");
new_file.append((slash == NULL) ? fname.c_str() : slash + 1);
Status s = env_->RenameFile(fname, new_file);
Log(options_.info_log, "Archiving %s: %s\n",
fname.c_str(), s.ToString().c_str());
}
};
} // namespace
Status RepairDB(const std::string& dbname, const Options& options) {
Repairer repairer(dbname, options);
return repairer.Run();
}
} // namespace leveldb

379
src/leveldb/db/skiplist.h Normal file
View File

@@ -0,0 +1,379 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Thread safety
// -------------
//
// Writes require external synchronization, most likely a mutex.
// Reads require a guarantee that the SkipList will not be destroyed
// while the read is in progress. Apart from that, reads progress
// without any internal locking or synchronization.
//
// Invariants:
//
// (1) Allocated nodes are never deleted until the SkipList is
// destroyed. This is trivially guaranteed by the code since we
// never delete any skip list nodes.
//
// (2) The contents of a Node except for the next/prev pointers are
// immutable after the Node has been linked into the SkipList.
// Only Insert() modifies the list, and it is careful to initialize
// a node and use release-stores to publish the nodes in one or
// more lists.
//
// ... prev vs. next pointer ordering ...
#include <assert.h>
#include <stdlib.h>
#include "port/port.h"
#include "util/arena.h"
#include "util/random.h"
namespace leveldb {
class Arena;
template<typename Key, class Comparator>
class SkipList {
private:
struct Node;
public:
// Create a new SkipList object that will use "cmp" for comparing keys,
// and will allocate memory using "*arena". Objects allocated in the arena
// must remain allocated for the lifetime of the skiplist object.
explicit SkipList(Comparator cmp, Arena* arena);
// Insert key into the list.
// REQUIRES: nothing that compares equal to key is currently in the list.
void Insert(const Key& key);
// Returns true iff an entry that compares equal to key is in the list.
bool Contains(const Key& key) const;
// Iteration over the contents of a skip list
class Iterator {
public:
// Initialize an iterator over the specified list.
// The returned iterator is not valid.
explicit Iterator(const SkipList* list);
// Returns true iff the iterator is positioned at a valid node.
bool Valid() const;
// Returns the key at the current position.
// REQUIRES: Valid()
const Key& key() const;
// Advances to the next position.
// REQUIRES: Valid()
void Next();
// Advances to the previous position.
// REQUIRES: Valid()
void Prev();
// Advance to the first entry with a key >= target
void Seek(const Key& target);
// Position at the first entry in list.
// Final state of iterator is Valid() iff list is not empty.
void SeekToFirst();
// Position at the last entry in list.
// Final state of iterator is Valid() iff list is not empty.
void SeekToLast();
private:
const SkipList* list_;
Node* node_;
// Intentionally copyable
};
private:
enum { kMaxHeight = 12 };
// Immutable after construction
Comparator const compare_;
Arena* const arena_; // Arena used for allocations of nodes
Node* const head_;
// Modified only by Insert(). Read racily by readers, but stale
// values are ok.
port::AtomicPointer max_height_; // Height of the entire list
inline int GetMaxHeight() const {
return static_cast<int>(
reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load()));
}
// Read/written only by Insert().
Random rnd_;
Node* NewNode(const Key& key, int height);
int RandomHeight();
bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
// Return true if key is greater than the data stored in "n"
bool KeyIsAfterNode(const Key& key, Node* n) const;
// Return the earliest node that comes at or after key.
// Return NULL if there is no such node.
//
// If prev is non-NULL, fills prev[level] with pointer to previous
// node at "level" for every level in [0..max_height_-1].
Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
// Return the latest node with a key < key.
// Return head_ if there is no such node.
Node* FindLessThan(const Key& key) const;
// Return the last node in the list.
// Return head_ if list is empty.
Node* FindLast() const;
// No copying allowed
SkipList(const SkipList&);
void operator=(const SkipList&);
};
// Implementation details follow
template<typename Key, class Comparator>
struct SkipList<Key,Comparator>::Node {
explicit Node(const Key& k) : key(k) { }
Key const key;
// Accessors/mutators for links. Wrapped in methods so we can
// add the appropriate barriers as necessary.
Node* Next(int n) {
assert(n >= 0);
// Use an 'acquire load' so that we observe a fully initialized
// version of the returned Node.
return reinterpret_cast<Node*>(next_[n].Acquire_Load());
}
void SetNext(int n, Node* x) {
assert(n >= 0);
// Use a 'release store' so that anybody who reads through this
// pointer observes a fully initialized version of the inserted node.
next_[n].Release_Store(x);
}
// No-barrier variants that can be safely used in a few locations.
Node* NoBarrier_Next(int n) {
assert(n >= 0);
return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
}
void NoBarrier_SetNext(int n, Node* x) {
assert(n >= 0);
next_[n].NoBarrier_Store(x);
}
private:
// Array of length equal to the node height. next_[0] is lowest level link.
port::AtomicPointer next_[1];
};
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node*
SkipList<Key,Comparator>::NewNode(const Key& key, int height) {
char* mem = arena_->AllocateAligned(
sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1));
return new (mem) Node(key);
}
template<typename Key, class Comparator>
inline SkipList<Key,Comparator>::Iterator::Iterator(const SkipList* list) {
list_ = list;
node_ = NULL;
}
template<typename Key, class Comparator>
inline bool SkipList<Key,Comparator>::Iterator::Valid() const {
return node_ != NULL;
}
template<typename Key, class Comparator>
inline const Key& SkipList<Key,Comparator>::Iterator::key() const {
assert(Valid());
return node_->key;
}
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::Next() {
assert(Valid());
node_ = node_->Next(0);
}
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::Prev() {
// Instead of using explicit "prev" links, we just search for the
// last node that falls before key.
assert(Valid());
node_ = list_->FindLessThan(node_->key);
if (node_ == list_->head_) {
node_ = NULL;
}
}
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::Seek(const Key& target) {
node_ = list_->FindGreaterOrEqual(target, NULL);
}
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::SeekToFirst() {
node_ = list_->head_->Next(0);
}
template<typename Key, class Comparator>
inline void SkipList<Key,Comparator>::Iterator::SeekToLast() {
node_ = list_->FindLast();
if (node_ == list_->head_) {
node_ = NULL;
}
}
template<typename Key, class Comparator>
int SkipList<Key,Comparator>::RandomHeight() {
// Increase height with probability 1 in kBranching
static const unsigned int kBranching = 4;
int height = 1;
while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) {
height++;
}
assert(height > 0);
assert(height <= kMaxHeight);
return height;
}
template<typename Key, class Comparator>
bool SkipList<Key,Comparator>::KeyIsAfterNode(const Key& key, Node* n) const {
// NULL n is considered infinite
return (n != NULL) && (compare_(n->key, key) < 0);
}
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindGreaterOrEqual(const Key& key, Node** prev)
const {
Node* x = head_;
int level = GetMaxHeight() - 1;
while (true) {
Node* next = x->Next(level);
if (KeyIsAfterNode(key, next)) {
// Keep searching in this list
x = next;
} else {
if (prev != NULL) prev[level] = x;
if (level == 0) {
return next;
} else {
// Switch to next list
level--;
}
}
}
}
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node*
SkipList<Key,Comparator>::FindLessThan(const Key& key) const {
Node* x = head_;
int level = GetMaxHeight() - 1;
while (true) {
assert(x == head_ || compare_(x->key, key) < 0);
Node* next = x->Next(level);
if (next == NULL || compare_(next->key, key) >= 0) {
if (level == 0) {
return x;
} else {
// Switch to next list
level--;
}
} else {
x = next;
}
}
}
template<typename Key, class Comparator>
typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindLast()
const {
Node* x = head_;
int level = GetMaxHeight() - 1;
while (true) {
Node* next = x->Next(level);
if (next == NULL) {
if (level == 0) {
return x;
} else {
// Switch to next list
level--;
}
} else {
x = next;
}
}
}
template<typename Key, class Comparator>
SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena)
: compare_(cmp),
arena_(arena),
head_(NewNode(0 /* any key will do */, kMaxHeight)),
max_height_(reinterpret_cast<void*>(1)),
rnd_(0xdeadbeef) {
for (int i = 0; i < kMaxHeight; i++) {
head_->SetNext(i, NULL);
}
}
template<typename Key, class Comparator>
void SkipList<Key,Comparator>::Insert(const Key& key) {
// TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
// here since Insert() is externally synchronized.
Node* prev[kMaxHeight];
Node* x = FindGreaterOrEqual(key, prev);
// Our data structure does not allow duplicate insertion
assert(x == NULL || !Equal(key, x->key));
int height = RandomHeight();
if (height > GetMaxHeight()) {
for (int i = GetMaxHeight(); i < height; i++) {
prev[i] = head_;
}
//fprintf(stderr, "Change height from %d to %d\n", max_height_, height);
// It is ok to mutate max_height_ without any synchronization
// with concurrent readers. A concurrent reader that observes
// the new value of max_height_ will see either the old value of
// new level pointers from head_ (NULL), or a new value set in
// the loop below. In the former case the reader will
// immediately drop to the next level since NULL sorts after all
// keys. In the latter case the reader will use the new node.
max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
}
x = NewNode(key, height);
for (int i = 0; i < height; i++) {
// NoBarrier_SetNext() suffices since we will add a barrier when
// we publish a pointer to "x" in prev[i].
x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i));
prev[i]->SetNext(i, x);
}
}
template<typename Key, class Comparator>
bool SkipList<Key,Comparator>::Contains(const Key& key) const {
Node* x = FindGreaterOrEqual(key, NULL);
if (x != NULL && Equal(key, x->key)) {
return true;
} else {
return false;
}
}
} // namespace leveldb

View File

@@ -0,0 +1,378 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/skiplist.h"
#include <set>
#include "leveldb/env.h"
#include "util/arena.h"
#include "util/hash.h"
#include "util/random.h"
#include "util/testharness.h"
namespace leveldb {
typedef uint64_t Key;
struct Comparator {
int operator()(const Key& a, const Key& b) const {
if (a < b) {
return -1;
} else if (a > b) {
return +1;
} else {
return 0;
}
}
};
class SkipTest { };
TEST(SkipTest, Empty) {
Arena arena;
Comparator cmp;
SkipList<Key, Comparator> list(cmp, &arena);
ASSERT_TRUE(!list.Contains(10));
SkipList<Key, Comparator>::Iterator iter(&list);
ASSERT_TRUE(!iter.Valid());
iter.SeekToFirst();
ASSERT_TRUE(!iter.Valid());
iter.Seek(100);
ASSERT_TRUE(!iter.Valid());
iter.SeekToLast();
ASSERT_TRUE(!iter.Valid());
}
TEST(SkipTest, InsertAndLookup) {
const int N = 2000;
const int R = 5000;
Random rnd(1000);
std::set<Key> keys;
Arena arena;
Comparator cmp;
SkipList<Key, Comparator> list(cmp, &arena);
for (int i = 0; i < N; i++) {
Key key = rnd.Next() % R;
if (keys.insert(key).second) {
list.Insert(key);
}
}
for (int i = 0; i < R; i++) {
if (list.Contains(i)) {
ASSERT_EQ(keys.count(i), 1);
} else {
ASSERT_EQ(keys.count(i), 0);
}
}
// Simple iterator tests
{
SkipList<Key, Comparator>::Iterator iter(&list);
ASSERT_TRUE(!iter.Valid());
iter.Seek(0);
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*(keys.begin()), iter.key());
iter.SeekToFirst();
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*(keys.begin()), iter.key());
iter.SeekToLast();
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*(keys.rbegin()), iter.key());
}
// Forward iteration test
for (int i = 0; i < R; i++) {
SkipList<Key, Comparator>::Iterator iter(&list);
iter.Seek(i);
// Compare against model iterator
std::set<Key>::iterator model_iter = keys.lower_bound(i);
for (int j = 0; j < 3; j++) {
if (model_iter == keys.end()) {
ASSERT_TRUE(!iter.Valid());
break;
} else {
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*model_iter, iter.key());
++model_iter;
iter.Next();
}
}
}
// Backward iteration test
{
SkipList<Key, Comparator>::Iterator iter(&list);
iter.SeekToLast();
// Compare against model iterator
for (std::set<Key>::reverse_iterator model_iter = keys.rbegin();
model_iter != keys.rend();
++model_iter) {
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(*model_iter, iter.key());
iter.Prev();
}
ASSERT_TRUE(!iter.Valid());
}
}
// We want to make sure that with a single writer and multiple
// concurrent readers (with no synchronization other than when a
// reader's iterator is created), the reader always observes all the
// data that was present in the skip list when the iterator was
// constructor. Because insertions are happening concurrently, we may
// also observe new values that were inserted since the iterator was
// constructed, but we should never miss any values that were present
// at iterator construction time.
//
// We generate multi-part keys:
// <key,gen,hash>
// where:
// key is in range [0..K-1]
// gen is a generation number for key
// hash is hash(key,gen)
//
// The insertion code picks a random key, sets gen to be 1 + the last
// generation number inserted for that key, and sets hash to Hash(key,gen).
//
// At the beginning of a read, we snapshot the last inserted
// generation number for each key. We then iterate, including random
// calls to Next() and Seek(). For every key we encounter, we
// check that it is either expected given the initial snapshot or has
// been concurrently added since the iterator started.
class ConcurrentTest {
private:
static const uint32_t K = 4;
static uint64_t key(Key key) { return (key >> 40); }
static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; }
static uint64_t hash(Key key) { return key & 0xff; }
static uint64_t HashNumbers(uint64_t k, uint64_t g) {
uint64_t data[2] = { k, g };
return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
}
static Key MakeKey(uint64_t k, uint64_t g) {
assert(sizeof(Key) == sizeof(uint64_t));
assert(k <= K); // We sometimes pass K to seek to the end of the skiplist
assert(g <= 0xffffffffu);
return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
}
static bool IsValidKey(Key k) {
return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
}
static Key RandomTarget(Random* rnd) {
switch (rnd->Next() % 10) {
case 0:
// Seek to beginning
return MakeKey(0, 0);
case 1:
// Seek to end
return MakeKey(K, 0);
default:
// Seek to middle
return MakeKey(rnd->Next() % K, 0);
}
}
// Per-key generation
struct State {
port::AtomicPointer generation[K];
void Set(int k, intptr_t v) {
generation[k].Release_Store(reinterpret_cast<void*>(v));
}
intptr_t Get(int k) {
return reinterpret_cast<intptr_t>(generation[k].Acquire_Load());
}
State() {
for (int k = 0; k < K; k++) {
Set(k, 0);
}
}
};
// Current state of the test
State current_;
Arena arena_;
// SkipList is not protected by mu_. We just use a single writer
// thread to modify it.
SkipList<Key, Comparator> list_;
public:
ConcurrentTest() : list_(Comparator(), &arena_) { }
// REQUIRES: External synchronization
void WriteStep(Random* rnd) {
const uint32_t k = rnd->Next() % K;
const intptr_t g = current_.Get(k) + 1;
const Key key = MakeKey(k, g);
list_.Insert(key);
current_.Set(k, g);
}
void ReadStep(Random* rnd) {
// Remember the initial committed state of the skiplist.
State initial_state;
for (int k = 0; k < K; k++) {
initial_state.Set(k, current_.Get(k));
}
Key pos = RandomTarget(rnd);
SkipList<Key, Comparator>::Iterator iter(&list_);
iter.Seek(pos);
while (true) {
Key current;
if (!iter.Valid()) {
current = MakeKey(K, 0);
} else {
current = iter.key();
ASSERT_TRUE(IsValidKey(current)) << current;
}
ASSERT_LE(pos, current) << "should not go backwards";
// Verify that everything in [pos,current) was not present in
// initial_state.
while (pos < current) {
ASSERT_LT(key(pos), K) << pos;
// Note that generation 0 is never inserted, so it is ok if
// <*,0,*> is missing.
ASSERT_TRUE((gen(pos) == 0) ||
(gen(pos) > initial_state.Get(key(pos)))
) << "key: " << key(pos)
<< "; gen: " << gen(pos)
<< "; initgen: "
<< initial_state.Get(key(pos));
// Advance to next key in the valid key space
if (key(pos) < key(current)) {
pos = MakeKey(key(pos) + 1, 0);
} else {
pos = MakeKey(key(pos), gen(pos) + 1);
}
}
if (!iter.Valid()) {
break;
}
if (rnd->Next() % 2) {
iter.Next();
pos = MakeKey(key(pos), gen(pos) + 1);
} else {
Key new_target = RandomTarget(rnd);
if (new_target > pos) {
pos = new_target;
iter.Seek(new_target);
}
}
}
}
};
const uint32_t ConcurrentTest::K;
// Simple test that does single-threaded testing of the ConcurrentTest
// scaffolding.
TEST(SkipTest, ConcurrentWithoutThreads) {
ConcurrentTest test;
Random rnd(test::RandomSeed());
for (int i = 0; i < 10000; i++) {
test.ReadStep(&rnd);
test.WriteStep(&rnd);
}
}
class TestState {
public:
ConcurrentTest t_;
int seed_;
port::AtomicPointer quit_flag_;
enum ReaderState {
STARTING,
RUNNING,
DONE
};
explicit TestState(int s)
: seed_(s),
quit_flag_(NULL),
state_(STARTING),
state_cv_(&mu_) {}
void Wait(ReaderState s) {
mu_.Lock();
while (state_ != s) {
state_cv_.Wait();
}
mu_.Unlock();
}
void Change(ReaderState s) {
mu_.Lock();
state_ = s;
state_cv_.Signal();
mu_.Unlock();
}
private:
port::Mutex mu_;
ReaderState state_;
port::CondVar state_cv_;
};
static void ConcurrentReader(void* arg) {
TestState* state = reinterpret_cast<TestState*>(arg);
Random rnd(state->seed_);
int64_t reads = 0;
state->Change(TestState::RUNNING);
while (!state->quit_flag_.Acquire_Load()) {
state->t_.ReadStep(&rnd);
++reads;
}
state->Change(TestState::DONE);
}
static void RunConcurrent(int run) {
const int seed = test::RandomSeed() + (run * 100);
Random rnd(seed);
const int N = 1000;
const int kSize = 1000;
for (int i = 0; i < N; i++) {
if ((i % 100) == 0) {
fprintf(stderr, "Run %d of %d\n", i, N);
}
TestState state(seed + 1);
Env::Default()->Schedule(ConcurrentReader, &state);
state.Wait(TestState::RUNNING);
for (int i = 0; i < kSize; i++) {
state.t_.WriteStep(&rnd);
}
state.quit_flag_.Release_Store(&state); // Any non-NULL arg will do
state.Wait(TestState::DONE);
}
}
TEST(SkipTest, Concurrent1) { RunConcurrent(1); }
TEST(SkipTest, Concurrent2) { RunConcurrent(2); }
TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
} // namespace leveldb
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

66
src/leveldb/db/snapshot.h Normal file
View File

@@ -0,0 +1,66 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_
#define STORAGE_LEVELDB_DB_SNAPSHOT_H_
#include "leveldb/db.h"
namespace leveldb {
class SnapshotList;
// Snapshots are kept in a doubly-linked list in the DB.
// Each SnapshotImpl corresponds to a particular sequence number.
class SnapshotImpl : public Snapshot {
public:
SequenceNumber number_; // const after creation
private:
friend class SnapshotList;
// SnapshotImpl is kept in a doubly-linked circular list
SnapshotImpl* prev_;
SnapshotImpl* next_;
SnapshotList* list_; // just for sanity checks
};
class SnapshotList {
public:
SnapshotList() {
list_.prev_ = &list_;
list_.next_ = &list_;
}
bool empty() const { return list_.next_ == &list_; }
SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; }
SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; }
const SnapshotImpl* New(SequenceNumber seq) {
SnapshotImpl* s = new SnapshotImpl;
s->number_ = seq;
s->list_ = this;
s->next_ = &list_;
s->prev_ = list_.prev_;
s->prev_->next_ = s;
s->next_->prev_ = s;
return s;
}
void Delete(const SnapshotImpl* s) {
assert(s->list_ == this);
s->prev_->next_ = s->next_;
s->next_->prev_ = s->prev_;
delete s;
}
private:
// Dummy head of doubly-linked list of snapshots
SnapshotImpl list_;
};
} // namespace leveldb
#endif // STORAGE_LEVELDB_DB_SNAPSHOT_H_

View File

@@ -0,0 +1,121 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/table_cache.h"
#include "db/filename.h"
#include "leveldb/env.h"
#include "leveldb/table.h"
#include "util/coding.h"
namespace leveldb {
struct TableAndFile {
RandomAccessFile* file;
Table* table;
};
static void DeleteEntry(const Slice& key, void* value) {
TableAndFile* tf = reinterpret_cast<TableAndFile*>(value);
delete tf->table;
delete tf->file;
delete tf;
}
static void UnrefEntry(void* arg1, void* arg2) {
Cache* cache = reinterpret_cast<Cache*>(arg1);
Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
cache->Release(h);
}
TableCache::TableCache(const std::string& dbname,
const Options* options,
int entries)
: env_(options->env),
dbname_(dbname),
options_(options),
cache_(NewLRUCache(entries)) {
}
TableCache::~TableCache() {
delete cache_;
}
Status TableCache::FindTable(uint64_t file_number, uint64_t file_size,
Cache::Handle** handle) {
Status s;
char buf[sizeof(file_number)];
EncodeFixed64(buf, file_number);
Slice key(buf, sizeof(buf));
*handle = cache_->Lookup(key);
if (*handle == NULL) {
std::string fname = TableFileName(dbname_, file_number);
RandomAccessFile* file = NULL;
Table* table = NULL;
s = env_->NewRandomAccessFile(fname, &file);
if (s.ok()) {
s = Table::Open(*options_, file, file_size, &table);
}
if (!s.ok()) {
assert(table == NULL);
delete file;
// We do not cache error results so that if the error is transient,
// or somebody repairs the file, we recover automatically.
} else {
TableAndFile* tf = new TableAndFile;
tf->file = file;
tf->table = table;
*handle = cache_->Insert(key, tf, 1, &DeleteEntry);
}
}
return s;
}
Iterator* TableCache::NewIterator(const ReadOptions& options,
uint64_t file_number,
uint64_t file_size,
Table** tableptr) {
if (tableptr != NULL) {
*tableptr = NULL;
}
Cache::Handle* handle = NULL;
Status s = FindTable(file_number, file_size, &handle);
if (!s.ok()) {
return NewErrorIterator(s);
}
Table* table = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
Iterator* result = table->NewIterator(options);
result->RegisterCleanup(&UnrefEntry, cache_, handle);
if (tableptr != NULL) {
*tableptr = table;
}
return result;
}
Status TableCache::Get(const ReadOptions& options,
uint64_t file_number,
uint64_t file_size,
const Slice& k,
void* arg,
void (*saver)(void*, const Slice&, const Slice&)) {
Cache::Handle* handle = NULL;
Status s = FindTable(file_number, file_size, &handle);
if (s.ok()) {
Table* t = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
s = t->InternalGet(options, k, arg, saver);
cache_->Release(handle);
}
return s;
}
void TableCache::Evict(uint64_t file_number) {
char buf[sizeof(file_number)];
EncodeFixed64(buf, file_number);
cache_->Erase(Slice(buf, sizeof(buf)));
}
} // namespace leveldb

View File

@@ -0,0 +1,61 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Thread-safe (provides internal synchronization)
#ifndef STORAGE_LEVELDB_DB_TABLE_CACHE_H_
#define STORAGE_LEVELDB_DB_TABLE_CACHE_H_
#include <string>
#include <stdint.h>
#include "db/dbformat.h"
#include "leveldb/cache.h"
#include "leveldb/table.h"
#include "port/port.h"
namespace leveldb {
class Env;
class TableCache {
public:
TableCache(const std::string& dbname, const Options* options, int entries);
~TableCache();
// Return an iterator for the specified file number (the corresponding
// file length must be exactly "file_size" bytes). If "tableptr" is
// non-NULL, also sets "*tableptr" to point to the Table object
// underlying the returned iterator, or NULL if no Table object underlies
// the returned iterator. The returned "*tableptr" object is owned by
// the cache and should not be deleted, and is valid for as long as the
// returned iterator is live.
Iterator* NewIterator(const ReadOptions& options,
uint64_t file_number,
uint64_t file_size,
Table** tableptr = NULL);
// If a seek to internal key "k" in specified file finds an entry,
// call (*handle_result)(arg, found_key, found_value).
Status Get(const ReadOptions& options,
uint64_t file_number,
uint64_t file_size,
const Slice& k,
void* arg,
void (*handle_result)(void*, const Slice&, const Slice&));
// Evict any entry for the specified file number
void Evict(uint64_t file_number);
private:
Env* const env_;
const std::string dbname_;
const Options* options_;
Cache* cache_;
Status FindTable(uint64_t file_number, uint64_t file_size, Cache::Handle**);
};
} // namespace leveldb
#endif // STORAGE_LEVELDB_DB_TABLE_CACHE_H_

View File

@@ -0,0 +1,266 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/version_edit.h"
#include "db/version_set.h"
#include "util/coding.h"
namespace leveldb {
// Tag numbers for serialized VersionEdit. These numbers are written to
// disk and should not be changed.
enum Tag {
kComparator = 1,
kLogNumber = 2,
kNextFileNumber = 3,
kLastSequence = 4,
kCompactPointer = 5,
kDeletedFile = 6,
kNewFile = 7,
// 8 was used for large value refs
kPrevLogNumber = 9
};
void VersionEdit::Clear() {
comparator_.clear();
log_number_ = 0;
prev_log_number_ = 0;
last_sequence_ = 0;
next_file_number_ = 0;
has_comparator_ = false;
has_log_number_ = false;
has_prev_log_number_ = false;
has_next_file_number_ = false;
has_last_sequence_ = false;
deleted_files_.clear();
new_files_.clear();
}
void VersionEdit::EncodeTo(std::string* dst) const {
if (has_comparator_) {
PutVarint32(dst, kComparator);
PutLengthPrefixedSlice(dst, comparator_);
}
if (has_log_number_) {
PutVarint32(dst, kLogNumber);
PutVarint64(dst, log_number_);
}
if (has_prev_log_number_) {
PutVarint32(dst, kPrevLogNumber);
PutVarint64(dst, prev_log_number_);
}
if (has_next_file_number_) {
PutVarint32(dst, kNextFileNumber);
PutVarint64(dst, next_file_number_);
}
if (has_last_sequence_) {
PutVarint32(dst, kLastSequence);
PutVarint64(dst, last_sequence_);
}
for (size_t i = 0; i < compact_pointers_.size(); i++) {
PutVarint32(dst, kCompactPointer);
PutVarint32(dst, compact_pointers_[i].first); // level
PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode());
}
for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
iter != deleted_files_.end();
++iter) {
PutVarint32(dst, kDeletedFile);
PutVarint32(dst, iter->first); // level
PutVarint64(dst, iter->second); // file number
}
for (size_t i = 0; i < new_files_.size(); i++) {
const FileMetaData& f = new_files_[i].second;
PutVarint32(dst, kNewFile);
PutVarint32(dst, new_files_[i].first); // level
PutVarint64(dst, f.number);
PutVarint64(dst, f.file_size);
PutLengthPrefixedSlice(dst, f.smallest.Encode());
PutLengthPrefixedSlice(dst, f.largest.Encode());
}
}
static bool GetInternalKey(Slice* input, InternalKey* dst) {
Slice str;
if (GetLengthPrefixedSlice(input, &str)) {
dst->DecodeFrom(str);
return true;
} else {
return false;
}
}
static bool GetLevel(Slice* input, int* level) {
uint32_t v;
if (GetVarint32(input, &v) &&
v < config::kNumLevels) {
*level = v;
return true;
} else {
return false;
}
}
Status VersionEdit::DecodeFrom(const Slice& src) {
Clear();
Slice input = src;
const char* msg = NULL;
uint32_t tag;
// Temporary storage for parsing
int level;
uint64_t number;
FileMetaData f;
Slice str;
InternalKey key;
while (msg == NULL && GetVarint32(&input, &tag)) {
switch (tag) {
case kComparator:
if (GetLengthPrefixedSlice(&input, &str)) {
comparator_ = str.ToString();
has_comparator_ = true;
} else {
msg = "comparator name";
}
break;
case kLogNumber:
if (GetVarint64(&input, &log_number_)) {
has_log_number_ = true;
} else {
msg = "log number";
}
break;
case kPrevLogNumber:
if (GetVarint64(&input, &prev_log_number_)) {
has_prev_log_number_ = true;
} else {
msg = "previous log number";
}
break;
case kNextFileNumber:
if (GetVarint64(&input, &next_file_number_)) {
has_next_file_number_ = true;
} else {
msg = "next file number";
}
break;
case kLastSequence:
if (GetVarint64(&input, &last_sequence_)) {
has_last_sequence_ = true;
} else {
msg = "last sequence number";
}
break;
case kCompactPointer:
if (GetLevel(&input, &level) &&
GetInternalKey(&input, &key)) {
compact_pointers_.push_back(std::make_pair(level, key));
} else {
msg = "compaction pointer";
}
break;
case kDeletedFile:
if (GetLevel(&input, &level) &&
GetVarint64(&input, &number)) {
deleted_files_.insert(std::make_pair(level, number));
} else {
msg = "deleted file";
}
break;
case kNewFile:
if (GetLevel(&input, &level) &&
GetVarint64(&input, &f.number) &&
GetVarint64(&input, &f.file_size) &&
GetInternalKey(&input, &f.smallest) &&
GetInternalKey(&input, &f.largest)) {
new_files_.push_back(std::make_pair(level, f));
} else {
msg = "new-file entry";
}
break;
default:
msg = "unknown tag";
break;
}
}
if (msg == NULL && !input.empty()) {
msg = "invalid tag";
}
Status result;
if (msg != NULL) {
result = Status::Corruption("VersionEdit", msg);
}
return result;
}
std::string VersionEdit::DebugString() const {
std::string r;
r.append("VersionEdit {");
if (has_comparator_) {
r.append("\n Comparator: ");
r.append(comparator_);
}
if (has_log_number_) {
r.append("\n LogNumber: ");
AppendNumberTo(&r, log_number_);
}
if (has_prev_log_number_) {
r.append("\n PrevLogNumber: ");
AppendNumberTo(&r, prev_log_number_);
}
if (has_next_file_number_) {
r.append("\n NextFile: ");
AppendNumberTo(&r, next_file_number_);
}
if (has_last_sequence_) {
r.append("\n LastSeq: ");
AppendNumberTo(&r, last_sequence_);
}
for (size_t i = 0; i < compact_pointers_.size(); i++) {
r.append("\n CompactPointer: ");
AppendNumberTo(&r, compact_pointers_[i].first);
r.append(" ");
r.append(compact_pointers_[i].second.DebugString());
}
for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
iter != deleted_files_.end();
++iter) {
r.append("\n DeleteFile: ");
AppendNumberTo(&r, iter->first);
r.append(" ");
AppendNumberTo(&r, iter->second);
}
for (size_t i = 0; i < new_files_.size(); i++) {
const FileMetaData& f = new_files_[i].second;
r.append("\n AddFile: ");
AppendNumberTo(&r, new_files_[i].first);
r.append(" ");
AppendNumberTo(&r, f.number);
r.append(" ");
AppendNumberTo(&r, f.file_size);
r.append(" ");
r.append(f.smallest.DebugString());
r.append(" .. ");
r.append(f.largest.DebugString());
}
r.append("\n}\n");
return r;
}
} // namespace leveldb

View File

@@ -0,0 +1,107 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_VERSION_EDIT_H_
#define STORAGE_LEVELDB_DB_VERSION_EDIT_H_
#include <set>
#include <utility>
#include <vector>
#include "db/dbformat.h"
namespace leveldb {
class VersionSet;
struct FileMetaData {
int refs;
int allowed_seeks; // Seeks allowed until compaction
uint64_t number;
uint64_t file_size; // File size in bytes
InternalKey smallest; // Smallest internal key served by table
InternalKey largest; // Largest internal key served by table
FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0) { }
};
class VersionEdit {
public:
VersionEdit() { Clear(); }
~VersionEdit() { }
void Clear();
void SetComparatorName(const Slice& name) {
has_comparator_ = true;
comparator_ = name.ToString();
}
void SetLogNumber(uint64_t num) {
has_log_number_ = true;
log_number_ = num;
}
void SetPrevLogNumber(uint64_t num) {
has_prev_log_number_ = true;
prev_log_number_ = num;
}
void SetNextFile(uint64_t num) {
has_next_file_number_ = true;
next_file_number_ = num;
}
void SetLastSequence(SequenceNumber seq) {
has_last_sequence_ = true;
last_sequence_ = seq;
}
void SetCompactPointer(int level, const InternalKey& key) {
compact_pointers_.push_back(std::make_pair(level, key));
}
// Add the specified file at the specified number.
// REQUIRES: This version has not been saved (see VersionSet::SaveTo)
// REQUIRES: "smallest" and "largest" are smallest and largest keys in file
void AddFile(int level, uint64_t file,
uint64_t file_size,
const InternalKey& smallest,
const InternalKey& largest) {
FileMetaData f;
f.number = file;
f.file_size = file_size;
f.smallest = smallest;
f.largest = largest;
new_files_.push_back(std::make_pair(level, f));
}
// Delete the specified "file" from the specified "level".
void DeleteFile(int level, uint64_t file) {
deleted_files_.insert(std::make_pair(level, file));
}
void EncodeTo(std::string* dst) const;
Status DecodeFrom(const Slice& src);
std::string DebugString() const;
private:
friend class VersionSet;
typedef std::set< std::pair<int, uint64_t> > DeletedFileSet;
std::string comparator_;
uint64_t log_number_;
uint64_t prev_log_number_;
uint64_t next_file_number_;
SequenceNumber last_sequence_;
bool has_comparator_;
bool has_log_number_;
bool has_prev_log_number_;
bool has_next_file_number_;
bool has_last_sequence_;
std::vector< std::pair<int, InternalKey> > compact_pointers_;
DeletedFileSet deleted_files_;
std::vector< std::pair<int, FileMetaData> > new_files_;
};
} // namespace leveldb
#endif // STORAGE_LEVELDB_DB_VERSION_EDIT_H_

View File

@@ -0,0 +1,46 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/version_edit.h"
#include "util/testharness.h"
namespace leveldb {
static void TestEncodeDecode(const VersionEdit& edit) {
std::string encoded, encoded2;
edit.EncodeTo(&encoded);
VersionEdit parsed;
Status s = parsed.DecodeFrom(encoded);
ASSERT_TRUE(s.ok()) << s.ToString();
parsed.EncodeTo(&encoded2);
ASSERT_EQ(encoded, encoded2);
}
class VersionEditTest { };
TEST(VersionEditTest, EncodeDecode) {
static const uint64_t kBig = 1ull << 50;
VersionEdit edit;
for (int i = 0; i < 4; i++) {
TestEncodeDecode(edit);
edit.AddFile(3, kBig + 300 + i, kBig + 400 + i,
InternalKey("foo", kBig + 500 + i, kTypeValue),
InternalKey("zoo", kBig + 600 + i, kTypeDeletion));
edit.DeleteFile(4, kBig + 700 + i);
edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue));
}
edit.SetComparatorName("foo");
edit.SetLogNumber(kBig + 100);
edit.SetNextFile(kBig + 200);
edit.SetLastSequence(kBig + 1000);
TestEncodeDecode(edit);
}
} // namespace leveldb
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,398 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// The representation of a DBImpl consists of a set of Versions. The
// newest version is called "current". Older versions may be kept
// around to provide a consistent view to live iterators.
//
// Each Version keeps track of a set of Table files per level. The
// entire set of versions is maintained in a VersionSet.
//
// Version,VersionSet are thread-compatible, but require external
// synchronization on all accesses.
#ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_
#define STORAGE_LEVELDB_DB_VERSION_SET_H_
#include <map>
#include <set>
#include <vector>
#include "db/dbformat.h"
#include "db/version_edit.h"
#include "port/port.h"
#include "port/thread_annotations.h"
namespace leveldb {
namespace log { class Writer; }
class Compaction;
class Iterator;
class MemTable;
class TableBuilder;
class TableCache;
class Version;
class VersionSet;
class WritableFile;
// Return the smallest index i such that files[i]->largest >= key.
// Return files.size() if there is no such file.
// REQUIRES: "files" contains a sorted list of non-overlapping files.
extern int FindFile(const InternalKeyComparator& icmp,
const std::vector<FileMetaData*>& files,
const Slice& key);
// Returns true iff some file in "files" overlaps the user key range
// [*smallest,*largest].
// smallest==NULL represents a key smaller than all keys in the DB.
// largest==NULL represents a key largest than all keys in the DB.
// REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges
// in sorted order.
extern bool SomeFileOverlapsRange(
const InternalKeyComparator& icmp,
bool disjoint_sorted_files,
const std::vector<FileMetaData*>& files,
const Slice* smallest_user_key,
const Slice* largest_user_key);
class Version {
public:
// Append to *iters a sequence of iterators that will
// yield the contents of this Version when merged together.
// REQUIRES: This version has been saved (see VersionSet::SaveTo)
void AddIterators(const ReadOptions&, std::vector<Iterator*>* iters);
// Lookup the value for key. If found, store it in *val and
// return OK. Else return a non-OK status. Fills *stats.
// REQUIRES: lock is not held
struct GetStats {
FileMetaData* seek_file;
int seek_file_level;
};
Status Get(const ReadOptions&, const LookupKey& key, std::string* val,
GetStats* stats);
// Adds "stats" into the current state. Returns true if a new
// compaction may need to be triggered, false otherwise.
// REQUIRES: lock is held
bool UpdateStats(const GetStats& stats);
// Record a sample of bytes read at the specified internal key.
// Samples are taken approximately once every config::kReadBytesPeriod
// bytes. Returns true if a new compaction may need to be triggered.
// REQUIRES: lock is held
bool RecordReadSample(Slice key);
// Reference count management (so Versions do not disappear out from
// under live iterators)
void Ref();
void Unref();
void GetOverlappingInputs(
int level,
const InternalKey* begin, // NULL means before all keys
const InternalKey* end, // NULL means after all keys
std::vector<FileMetaData*>* inputs);
// Returns true iff some file in the specified level overlaps
// some part of [*smallest_user_key,*largest_user_key].
// smallest_user_key==NULL represents a key smaller than all keys in the DB.
// largest_user_key==NULL represents a key largest than all keys in the DB.
bool OverlapInLevel(int level,
const Slice* smallest_user_key,
const Slice* largest_user_key);
// Return the level at which we should place a new memtable compaction
// result that covers the range [smallest_user_key,largest_user_key].
int PickLevelForMemTableOutput(const Slice& smallest_user_key,
const Slice& largest_user_key);
int NumFiles(int level) const { return files_[level].size(); }
// Return a human readable string that describes this version's contents.
std::string DebugString() const;
private:
friend class Compaction;
friend class VersionSet;
class LevelFileNumIterator;
Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const;
// Call func(arg, level, f) for every file that overlaps user_key in
// order from newest to oldest. If an invocation of func returns
// false, makes no more calls.
//
// REQUIRES: user portion of internal_key == user_key.
void ForEachOverlapping(Slice user_key, Slice internal_key,
void* arg,
bool (*func)(void*, int, FileMetaData*));
VersionSet* vset_; // VersionSet to which this Version belongs
Version* next_; // Next version in linked list
Version* prev_; // Previous version in linked list
int refs_; // Number of live refs to this version
// List of files per level
std::vector<FileMetaData*> files_[config::kNumLevels];
// Next file to compact based on seek stats.
FileMetaData* file_to_compact_;
int file_to_compact_level_;
// Level that should be compacted next and its compaction score.
// Score < 1 means compaction is not strictly needed. These fields
// are initialized by Finalize().
double compaction_score_;
int compaction_level_;
explicit Version(VersionSet* vset)
: vset_(vset), next_(this), prev_(this), refs_(0),
file_to_compact_(NULL),
file_to_compact_level_(-1),
compaction_score_(-1),
compaction_level_(-1) {
}
~Version();
// No copying allowed
Version(const Version&);
void operator=(const Version&);
};
class VersionSet {
public:
VersionSet(const std::string& dbname,
const Options* options,
TableCache* table_cache,
const InternalKeyComparator*);
~VersionSet();
// Apply *edit to the current version to form a new descriptor that
// is both saved to persistent state and installed as the new
// current version. Will release *mu while actually writing to the file.
// REQUIRES: *mu is held on entry.
// REQUIRES: no other thread concurrently calls LogAndApply()
Status LogAndApply(VersionEdit* edit, port::Mutex* mu)
EXCLUSIVE_LOCKS_REQUIRED(mu);
// Recover the last saved descriptor from persistent storage.
Status Recover();
// Return the current version.
Version* current() const { return current_; }
// Return the current manifest file number
uint64_t ManifestFileNumber() const { return manifest_file_number_; }
// Allocate and return a new file number
uint64_t NewFileNumber() { return next_file_number_++; }
// Arrange to reuse "file_number" unless a newer file number has
// already been allocated.
// REQUIRES: "file_number" was returned by a call to NewFileNumber().
void ReuseFileNumber(uint64_t file_number) {
if (next_file_number_ == file_number + 1) {
next_file_number_ = file_number;
}
}
// Return the number of Table files at the specified level.
int NumLevelFiles(int level) const;
// Return the combined file size of all files at the specified level.
int64_t NumLevelBytes(int level) const;
// Return the last sequence number.
uint64_t LastSequence() const { return last_sequence_; }
// Set the last sequence number to s.
void SetLastSequence(uint64_t s) {
assert(s >= last_sequence_);
last_sequence_ = s;
}
// Mark the specified file number as used.
void MarkFileNumberUsed(uint64_t number);
// Return the current log file number.
uint64_t LogNumber() const { return log_number_; }
// Return the log file number for the log file that is currently
// being compacted, or zero if there is no such log file.
uint64_t PrevLogNumber() const { return prev_log_number_; }
// Pick level and inputs for a new compaction.
// Returns NULL if there is no compaction to be done.
// Otherwise returns a pointer to a heap-allocated object that
// describes the compaction. Caller should delete the result.
Compaction* PickCompaction();
// Return a compaction object for compacting the range [begin,end] in
// the specified level. Returns NULL if there is nothing in that
// level that overlaps the specified range. Caller should delete
// the result.
Compaction* CompactRange(
int level,
const InternalKey* begin,
const InternalKey* end);
// Return the maximum overlapping data (in bytes) at next level for any
// file at a level >= 1.
int64_t MaxNextLevelOverlappingBytes();
// Create an iterator that reads over the compaction inputs for "*c".
// The caller should delete the iterator when no longer needed.
Iterator* MakeInputIterator(Compaction* c);
// Returns true iff some level needs a compaction.
bool NeedsCompaction() const {
Version* v = current_;
return (v->compaction_score_ >= 1) || (v->file_to_compact_ != NULL);
}
// Add all files listed in any live version to *live.
// May also mutate some internal state.
void AddLiveFiles(std::set<uint64_t>* live);
// Return the approximate offset in the database of the data for
// "key" as of version "v".
uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
// Return a human-readable short (single-line) summary of the number
// of files per level. Uses *scratch as backing store.
struct LevelSummaryStorage {
char buffer[100];
};
const char* LevelSummary(LevelSummaryStorage* scratch) const;
private:
class Builder;
friend class Compaction;
friend class Version;
void Finalize(Version* v);
void GetRange(const std::vector<FileMetaData*>& inputs,
InternalKey* smallest,
InternalKey* largest);
void GetRange2(const std::vector<FileMetaData*>& inputs1,
const std::vector<FileMetaData*>& inputs2,
InternalKey* smallest,
InternalKey* largest);
void SetupOtherInputs(Compaction* c);
// Save current contents to *log
Status WriteSnapshot(log::Writer* log);
void AppendVersion(Version* v);
bool ManifestContains(const std::string& record) const;
Env* const env_;
const std::string dbname_;
const Options* const options_;
TableCache* const table_cache_;
const InternalKeyComparator icmp_;
uint64_t next_file_number_;
uint64_t manifest_file_number_;
uint64_t last_sequence_;
uint64_t log_number_;
uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
// Opened lazily
WritableFile* descriptor_file_;
log::Writer* descriptor_log_;
Version dummy_versions_; // Head of circular doubly-linked list of versions.
Version* current_; // == dummy_versions_.prev_
// Per-level key at which the next compaction at that level should start.
// Either an empty string, or a valid InternalKey.
std::string compact_pointer_[config::kNumLevels];
// No copying allowed
VersionSet(const VersionSet&);
void operator=(const VersionSet&);
};
// A Compaction encapsulates information about a compaction.
class Compaction {
public:
~Compaction();
// Return the level that is being compacted. Inputs from "level"
// and "level+1" will be merged to produce a set of "level+1" files.
int level() const { return level_; }
// Return the object that holds the edits to the descriptor done
// by this compaction.
VersionEdit* edit() { return &edit_; }
// "which" must be either 0 or 1
int num_input_files(int which) const { return inputs_[which].size(); }
// Return the ith input file at "level()+which" ("which" must be 0 or 1).
FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
// Maximum size of files to build during this compaction.
uint64_t MaxOutputFileSize() const { return max_output_file_size_; }
// Is this a trivial compaction that can be implemented by just
// moving a single input file to the next level (no merging or splitting)
bool IsTrivialMove() const;
// Add all inputs to this compaction as delete operations to *edit.
void AddInputDeletions(VersionEdit* edit);
// Returns true if the information we have available guarantees that
// the compaction is producing data in "level+1" for which no data exists
// in levels greater than "level+1".
bool IsBaseLevelForKey(const Slice& user_key);
// Returns true iff we should stop building the current output
// before processing "internal_key".
bool ShouldStopBefore(const Slice& internal_key);
// Release the input version for the compaction, once the compaction
// is successful.
void ReleaseInputs();
private:
friend class Version;
friend class VersionSet;
explicit Compaction(int level);
int level_;
uint64_t max_output_file_size_;
Version* input_version_;
VersionEdit edit_;
// Each compaction reads inputs from "level_" and "level_+1"
std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs
// State used to check for number of of overlapping grandparent files
// (parent == level_ + 1, grandparent == level_ + 2)
std::vector<FileMetaData*> grandparents_;
size_t grandparent_index_; // Index in grandparent_starts_
bool seen_key_; // Some output key has been seen
int64_t overlapped_bytes_; // Bytes of overlap between current output
// and grandparent files
// State for implementing IsBaseLevelForKey
// level_ptrs_ holds indices into input_version_->levels_: our state
// is that we are positioned at one of the file ranges for each
// higher level than the ones involved in this compaction (i.e. for
// all L >= level_ + 2).
size_t level_ptrs_[config::kNumLevels];
};
} // namespace leveldb
#endif // STORAGE_LEVELDB_DB_VERSION_SET_H_

View File

@@ -0,0 +1,179 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/version_set.h"
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace leveldb {
class FindFileTest {
public:
std::vector<FileMetaData*> files_;
bool disjoint_sorted_files_;
FindFileTest() : disjoint_sorted_files_(true) { }
~FindFileTest() {
for (int i = 0; i < files_.size(); i++) {
delete files_[i];
}
}
void Add(const char* smallest, const char* largest,
SequenceNumber smallest_seq = 100,
SequenceNumber largest_seq = 100) {
FileMetaData* f = new FileMetaData;
f->number = files_.size() + 1;
f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
f->largest = InternalKey(largest, largest_seq, kTypeValue);
files_.push_back(f);
}
int Find(const char* key) {
InternalKey target(key, 100, kTypeValue);
InternalKeyComparator cmp(BytewiseComparator());
return FindFile(cmp, files_, target.Encode());
}
bool Overlaps(const char* smallest, const char* largest) {
InternalKeyComparator cmp(BytewiseComparator());
Slice s(smallest != NULL ? smallest : "");
Slice l(largest != NULL ? largest : "");
return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, files_,
(smallest != NULL ? &s : NULL),
(largest != NULL ? &l : NULL));
}
};
TEST(FindFileTest, Empty) {
ASSERT_EQ(0, Find("foo"));
ASSERT_TRUE(! Overlaps("a", "z"));
ASSERT_TRUE(! Overlaps(NULL, "z"));
ASSERT_TRUE(! Overlaps("a", NULL));
ASSERT_TRUE(! Overlaps(NULL, NULL));
}
TEST(FindFileTest, Single) {
Add("p", "q");
ASSERT_EQ(0, Find("a"));
ASSERT_EQ(0, Find("p"));
ASSERT_EQ(0, Find("p1"));
ASSERT_EQ(0, Find("q"));
ASSERT_EQ(1, Find("q1"));
ASSERT_EQ(1, Find("z"));
ASSERT_TRUE(! Overlaps("a", "b"));
ASSERT_TRUE(! Overlaps("z1", "z2"));
ASSERT_TRUE(Overlaps("a", "p"));
ASSERT_TRUE(Overlaps("a", "q"));
ASSERT_TRUE(Overlaps("a", "z"));
ASSERT_TRUE(Overlaps("p", "p1"));
ASSERT_TRUE(Overlaps("p", "q"));
ASSERT_TRUE(Overlaps("p", "z"));
ASSERT_TRUE(Overlaps("p1", "p2"));
ASSERT_TRUE(Overlaps("p1", "z"));
ASSERT_TRUE(Overlaps("q", "q"));
ASSERT_TRUE(Overlaps("q", "q1"));
ASSERT_TRUE(! Overlaps(NULL, "j"));
ASSERT_TRUE(! Overlaps("r", NULL));
ASSERT_TRUE(Overlaps(NULL, "p"));
ASSERT_TRUE(Overlaps(NULL, "p1"));
ASSERT_TRUE(Overlaps("q", NULL));
ASSERT_TRUE(Overlaps(NULL, NULL));
}
TEST(FindFileTest, Multiple) {
Add("150", "200");
Add("200", "250");
Add("300", "350");
Add("400", "450");
ASSERT_EQ(0, Find("100"));
ASSERT_EQ(0, Find("150"));
ASSERT_EQ(0, Find("151"));
ASSERT_EQ(0, Find("199"));
ASSERT_EQ(0, Find("200"));
ASSERT_EQ(1, Find("201"));
ASSERT_EQ(1, Find("249"));
ASSERT_EQ(1, Find("250"));
ASSERT_EQ(2, Find("251"));
ASSERT_EQ(2, Find("299"));
ASSERT_EQ(2, Find("300"));
ASSERT_EQ(2, Find("349"));
ASSERT_EQ(2, Find("350"));
ASSERT_EQ(3, Find("351"));
ASSERT_EQ(3, Find("400"));
ASSERT_EQ(3, Find("450"));
ASSERT_EQ(4, Find("451"));
ASSERT_TRUE(! Overlaps("100", "149"));
ASSERT_TRUE(! Overlaps("251", "299"));
ASSERT_TRUE(! Overlaps("451", "500"));
ASSERT_TRUE(! Overlaps("351", "399"));
ASSERT_TRUE(Overlaps("100", "150"));
ASSERT_TRUE(Overlaps("100", "200"));
ASSERT_TRUE(Overlaps("100", "300"));
ASSERT_TRUE(Overlaps("100", "400"));
ASSERT_TRUE(Overlaps("100", "500"));
ASSERT_TRUE(Overlaps("375", "400"));
ASSERT_TRUE(Overlaps("450", "450"));
ASSERT_TRUE(Overlaps("450", "500"));
}
TEST(FindFileTest, MultipleNullBoundaries) {
Add("150", "200");
Add("200", "250");
Add("300", "350");
Add("400", "450");
ASSERT_TRUE(! Overlaps(NULL, "149"));
ASSERT_TRUE(! Overlaps("451", NULL));
ASSERT_TRUE(Overlaps(NULL, NULL));
ASSERT_TRUE(Overlaps(NULL, "150"));
ASSERT_TRUE(Overlaps(NULL, "199"));
ASSERT_TRUE(Overlaps(NULL, "200"));
ASSERT_TRUE(Overlaps(NULL, "201"));
ASSERT_TRUE(Overlaps(NULL, "400"));
ASSERT_TRUE(Overlaps(NULL, "800"));
ASSERT_TRUE(Overlaps("100", NULL));
ASSERT_TRUE(Overlaps("200", NULL));
ASSERT_TRUE(Overlaps("449", NULL));
ASSERT_TRUE(Overlaps("450", NULL));
}
TEST(FindFileTest, OverlapSequenceChecks) {
Add("200", "200", 5000, 3000);
ASSERT_TRUE(! Overlaps("199", "199"));
ASSERT_TRUE(! Overlaps("201", "300"));
ASSERT_TRUE(Overlaps("200", "200"));
ASSERT_TRUE(Overlaps("190", "200"));
ASSERT_TRUE(Overlaps("200", "210"));
}
TEST(FindFileTest, OverlappingFiles) {
Add("150", "600");
Add("400", "500");
disjoint_sorted_files_ = false;
ASSERT_TRUE(! Overlaps("100", "149"));
ASSERT_TRUE(! Overlaps("601", "700"));
ASSERT_TRUE(Overlaps("100", "150"));
ASSERT_TRUE(Overlaps("100", "200"));
ASSERT_TRUE(Overlaps("100", "300"));
ASSERT_TRUE(Overlaps("100", "400"));
ASSERT_TRUE(Overlaps("100", "500"));
ASSERT_TRUE(Overlaps("375", "400"));
ASSERT_TRUE(Overlaps("450", "450"));
ASSERT_TRUE(Overlaps("450", "500"));
ASSERT_TRUE(Overlaps("450", "700"));
ASSERT_TRUE(Overlaps("600", "700"));
}
} // namespace leveldb
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

View File

@@ -0,0 +1,147 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// WriteBatch::rep_ :=
// sequence: fixed64
// count: fixed32
// data: record[count]
// record :=
// kTypeValue varstring varstring |
// kTypeDeletion varstring
// varstring :=
// len: varint32
// data: uint8[len]
#include "leveldb/write_batch.h"
#include "leveldb/db.h"
#include "db/dbformat.h"
#include "db/memtable.h"
#include "db/write_batch_internal.h"
#include "util/coding.h"
namespace leveldb {
// WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
static const size_t kHeader = 12;
WriteBatch::WriteBatch() {
Clear();
}
WriteBatch::~WriteBatch() { }
WriteBatch::Handler::~Handler() { }
void WriteBatch::Clear() {
rep_.clear();
rep_.resize(kHeader);
}
Status WriteBatch::Iterate(Handler* handler) const {
Slice input(rep_);
if (input.size() < kHeader) {
return Status::Corruption("malformed WriteBatch (too small)");
}
input.remove_prefix(kHeader);
Slice key, value;
int found = 0;
while (!input.empty()) {
found++;
char tag = input[0];
input.remove_prefix(1);
switch (tag) {
case kTypeValue:
if (GetLengthPrefixedSlice(&input, &key) &&
GetLengthPrefixedSlice(&input, &value)) {
handler->Put(key, value);
} else {
return Status::Corruption("bad WriteBatch Put");
}
break;
case kTypeDeletion:
if (GetLengthPrefixedSlice(&input, &key)) {
handler->Delete(key);
} else {
return Status::Corruption("bad WriteBatch Delete");
}
break;
default:
return Status::Corruption("unknown WriteBatch tag");
}
}
if (found != WriteBatchInternal::Count(this)) {
return Status::Corruption("WriteBatch has wrong count");
} else {
return Status::OK();
}
}
int WriteBatchInternal::Count(const WriteBatch* b) {
return DecodeFixed32(b->rep_.data() + 8);
}
void WriteBatchInternal::SetCount(WriteBatch* b, int n) {
EncodeFixed32(&b->rep_[8], n);
}
SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) {
return SequenceNumber(DecodeFixed64(b->rep_.data()));
}
void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
EncodeFixed64(&b->rep_[0], seq);
}
void WriteBatch::Put(const Slice& key, const Slice& value) {
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
rep_.push_back(static_cast<char>(kTypeValue));
PutLengthPrefixedSlice(&rep_, key);
PutLengthPrefixedSlice(&rep_, value);
}
void WriteBatch::Delete(const Slice& key) {
WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
rep_.push_back(static_cast<char>(kTypeDeletion));
PutLengthPrefixedSlice(&rep_, key);
}
namespace {
class MemTableInserter : public WriteBatch::Handler {
public:
SequenceNumber sequence_;
MemTable* mem_;
virtual void Put(const Slice& key, const Slice& value) {
mem_->Add(sequence_, kTypeValue, key, value);
sequence_++;
}
virtual void Delete(const Slice& key) {
mem_->Add(sequence_, kTypeDeletion, key, Slice());
sequence_++;
}
};
} // namespace
Status WriteBatchInternal::InsertInto(const WriteBatch* b,
MemTable* memtable) {
MemTableInserter inserter;
inserter.sequence_ = WriteBatchInternal::Sequence(b);
inserter.mem_ = memtable;
return b->Iterate(&inserter);
}
void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
assert(contents.size() >= kHeader);
b->rep_.assign(contents.data(), contents.size());
}
void WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src) {
SetCount(dst, Count(dst) + Count(src));
assert(src->rep_.size() >= kHeader);
dst->rep_.append(src->rep_.data() + kHeader, src->rep_.size() - kHeader);
}
} // namespace leveldb

View File

@@ -0,0 +1,49 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
#include "leveldb/write_batch.h"
namespace leveldb {
class MemTable;
// WriteBatchInternal provides static methods for manipulating a
// WriteBatch that we don't want in the public WriteBatch interface.
class WriteBatchInternal {
public:
// Return the number of entries in the batch.
static int Count(const WriteBatch* batch);
// Set the count for the number of entries in the batch.
static void SetCount(WriteBatch* batch, int n);
// Return the seqeunce number for the start of this batch.
static SequenceNumber Sequence(const WriteBatch* batch);
// Store the specified number as the seqeunce number for the start of
// this batch.
static void SetSequence(WriteBatch* batch, SequenceNumber seq);
static Slice Contents(const WriteBatch* batch) {
return Slice(batch->rep_);
}
static size_t ByteSize(const WriteBatch* batch) {
return batch->rep_.size();
}
static void SetContents(WriteBatch* batch, const Slice& contents);
static Status InsertInto(const WriteBatch* batch, MemTable* memtable);
static void Append(WriteBatch* dst, const WriteBatch* src);
};
} // namespace leveldb
#endif // STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_

View File

@@ -0,0 +1,120 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "leveldb/db.h"
#include "db/memtable.h"
#include "db/write_batch_internal.h"
#include "leveldb/env.h"
#include "util/logging.h"
#include "util/testharness.h"
namespace leveldb {
static std::string PrintContents(WriteBatch* b) {
InternalKeyComparator cmp(BytewiseComparator());
MemTable* mem = new MemTable(cmp);
mem->Ref();
std::string state;
Status s = WriteBatchInternal::InsertInto(b, mem);
int count = 0;
Iterator* iter = mem->NewIterator();
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ParsedInternalKey ikey;
ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
switch (ikey.type) {
case kTypeValue:
state.append("Put(");
state.append(ikey.user_key.ToString());
state.append(", ");
state.append(iter->value().ToString());
state.append(")");
count++;
break;
case kTypeDeletion:
state.append("Delete(");
state.append(ikey.user_key.ToString());
state.append(")");
count++;
break;
}
state.append("@");
state.append(NumberToString(ikey.sequence));
}
delete iter;
if (!s.ok()) {
state.append("ParseError()");
} else if (count != WriteBatchInternal::Count(b)) {
state.append("CountMismatch()");
}
mem->Unref();
return state;
}
class WriteBatchTest { };
TEST(WriteBatchTest, Empty) {
WriteBatch batch;
ASSERT_EQ("", PrintContents(&batch));
ASSERT_EQ(0, WriteBatchInternal::Count(&batch));
}
TEST(WriteBatchTest, Multiple) {
WriteBatch batch;
batch.Put(Slice("foo"), Slice("bar"));
batch.Delete(Slice("box"));
batch.Put(Slice("baz"), Slice("boo"));
WriteBatchInternal::SetSequence(&batch, 100);
ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch));
ASSERT_EQ(3, WriteBatchInternal::Count(&batch));
ASSERT_EQ("Put(baz, boo)@102"
"Delete(box)@101"
"Put(foo, bar)@100",
PrintContents(&batch));
}
TEST(WriteBatchTest, Corruption) {
WriteBatch batch;
batch.Put(Slice("foo"), Slice("bar"));
batch.Delete(Slice("box"));
WriteBatchInternal::SetSequence(&batch, 200);
Slice contents = WriteBatchInternal::Contents(&batch);
WriteBatchInternal::SetContents(&batch,
Slice(contents.data(),contents.size()-1));
ASSERT_EQ("Put(foo, bar)@200"
"ParseError()",
PrintContents(&batch));
}
TEST(WriteBatchTest, Append) {
WriteBatch b1, b2;
WriteBatchInternal::SetSequence(&b1, 200);
WriteBatchInternal::SetSequence(&b2, 300);
WriteBatchInternal::Append(&b1, &b2);
ASSERT_EQ("",
PrintContents(&b1));
b2.Put("a", "va");
WriteBatchInternal::Append(&b1, &b2);
ASSERT_EQ("Put(a, va)@200",
PrintContents(&b1));
b2.Clear();
b2.Put("b", "vb");
WriteBatchInternal::Append(&b1, &b2);
ASSERT_EQ("Put(a, va)@200"
"Put(b, vb)@201",
PrintContents(&b1));
b2.Delete("foo");
WriteBatchInternal::Append(&b1, &b2);
ASSERT_EQ("Put(a, va)@200"
"Put(b, vb)@202"
"Put(b, vb)@201"
"Delete(foo)@203",
PrintContents(&b1));
}
} // namespace leveldb
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

View File

@@ -0,0 +1,718 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <stdio.h>
#include <stdlib.h>
#include <sqlite3.h>
#include "util/histogram.h"
#include "util/random.h"
#include "util/testutil.h"
// Comma-separated list of operations to run in the specified order
// Actual benchmarks:
//
// fillseq -- write N values in sequential key order in async mode
// fillseqsync -- write N/100 values in sequential key order in sync mode
// fillseqbatch -- batch write N values in sequential key order in async mode
// fillrandom -- write N values in random key order in async mode
// fillrandsync -- write N/100 values in random key order in sync mode
// fillrandbatch -- batch write N values in sequential key order in async mode
// overwrite -- overwrite N values in random key order in async mode
// fillrand100K -- write N/1000 100K values in random order in async mode
// fillseq100K -- write N/1000 100K values in sequential order in async mode
// readseq -- read N times sequentially
// readrandom -- read N times in random order
// readrand100K -- read N/1000 100K values in sequential order in async mode
static const char* FLAGS_benchmarks =
"fillseq,"
"fillseqsync,"
"fillseqbatch,"
"fillrandom,"
"fillrandsync,"
"fillrandbatch,"
"overwrite,"
"overwritebatch,"
"readrandom,"
"readseq,"
"fillrand100K,"
"fillseq100K,"
"readseq,"
"readrand100K,"
;
// Number of key/values to place in database
static int FLAGS_num = 1000000;
// Number of read operations to do. If negative, do FLAGS_num reads.
static int FLAGS_reads = -1;
// Size of each value
static int FLAGS_value_size = 100;
// Print histogram of operation timings
static bool FLAGS_histogram = false;
// Arrange to generate values that shrink to this fraction of
// their original size after compression
static double FLAGS_compression_ratio = 0.5;
// Page size. Default 1 KB.
static int FLAGS_page_size = 1024;
// Number of pages.
// Default cache size = FLAGS_page_size * FLAGS_num_pages = 4 MB.
static int FLAGS_num_pages = 4096;
// If true, do not destroy the existing database. If you set this
// flag and also specify a benchmark that wants a fresh database, that
// benchmark will fail.
static bool FLAGS_use_existing_db = false;
// If true, we allow batch writes to occur
static bool FLAGS_transaction = true;
// If true, we enable Write-Ahead Logging
static bool FLAGS_WAL_enabled = true;
// Use the db with the following name.
static const char* FLAGS_db = NULL;
inline
static void ExecErrorCheck(int status, char *err_msg) {
if (status != SQLITE_OK) {
fprintf(stderr, "SQL error: %s\n", err_msg);
sqlite3_free(err_msg);
exit(1);
}
}
inline
static void StepErrorCheck(int status) {
if (status != SQLITE_DONE) {
fprintf(stderr, "SQL step error: status = %d\n", status);
exit(1);
}
}
inline
static void ErrorCheck(int status) {
if (status != SQLITE_OK) {
fprintf(stderr, "sqlite3 error: status = %d\n", status);
exit(1);
}
}
inline
static void WalCheckpoint(sqlite3* db_) {
// Flush all writes to disk
if (FLAGS_WAL_enabled) {
sqlite3_wal_checkpoint_v2(db_, NULL, SQLITE_CHECKPOINT_FULL, NULL, NULL);
}
}
namespace leveldb {
// Helper for quickly generating random data.
namespace {
class RandomGenerator {
private:
std::string data_;
int pos_;
public:
RandomGenerator() {
// We use a limited amount of data over and over again and ensure
// that it is larger than the compression window (32KB), and also
// large enough to serve all typical value sizes we want to write.
Random rnd(301);
std::string piece;
while (data_.size() < 1048576) {
// Add a short fragment that is as compressible as specified
// by FLAGS_compression_ratio.
test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
data_.append(piece);
}
pos_ = 0;
}
Slice Generate(int len) {
if (pos_ + len > data_.size()) {
pos_ = 0;
assert(len < data_.size());
}
pos_ += len;
return Slice(data_.data() + pos_ - len, len);
}
};
static Slice TrimSpace(Slice s) {
int start = 0;
while (start < s.size() && isspace(s[start])) {
start++;
}
int limit = s.size();
while (limit > start && isspace(s[limit-1])) {
limit--;
}
return Slice(s.data() + start, limit - start);
}
} // namespace
class Benchmark {
private:
sqlite3* db_;
int db_num_;
int num_;
int reads_;
double start_;
double last_op_finish_;
int64_t bytes_;
std::string message_;
Histogram hist_;
RandomGenerator gen_;
Random rand_;
// State kept for progress messages
int done_;
int next_report_; // When to report next
void PrintHeader() {
const int kKeySize = 16;
PrintEnvironment();
fprintf(stdout, "Keys: %d bytes each\n", kKeySize);
fprintf(stdout, "Values: %d bytes each\n", FLAGS_value_size);
fprintf(stdout, "Entries: %d\n", num_);
fprintf(stdout, "RawSize: %.1f MB (estimated)\n",
((static_cast<int64_t>(kKeySize + FLAGS_value_size) * num_)
/ 1048576.0));
PrintWarnings();
fprintf(stdout, "------------------------------------------------\n");
}
void PrintWarnings() {
#if defined(__GNUC__) && !defined(__OPTIMIZE__)
fprintf(stdout,
"WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"
);
#endif
#ifndef NDEBUG
fprintf(stdout,
"WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
#endif
}
void PrintEnvironment() {
fprintf(stderr, "SQLite: version %s\n", SQLITE_VERSION);
#if defined(__linux)
time_t now = time(NULL);
fprintf(stderr, "Date: %s", ctime(&now)); // ctime() adds newline
FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
if (cpuinfo != NULL) {
char line[1000];
int num_cpus = 0;
std::string cpu_type;
std::string cache_size;
while (fgets(line, sizeof(line), cpuinfo) != NULL) {
const char* sep = strchr(line, ':');
if (sep == NULL) {
continue;
}
Slice key = TrimSpace(Slice(line, sep - 1 - line));
Slice val = TrimSpace(Slice(sep + 1));
if (key == "model name") {
++num_cpus;
cpu_type = val.ToString();
} else if (key == "cache size") {
cache_size = val.ToString();
}
}
fclose(cpuinfo);
fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str());
fprintf(stderr, "CPUCache: %s\n", cache_size.c_str());
}
#endif
}
void Start() {
start_ = Env::Default()->NowMicros() * 1e-6;
bytes_ = 0;
message_.clear();
last_op_finish_ = start_;
hist_.Clear();
done_ = 0;
next_report_ = 100;
}
void FinishedSingleOp() {
if (FLAGS_histogram) {
double now = Env::Default()->NowMicros() * 1e-6;
double micros = (now - last_op_finish_) * 1e6;
hist_.Add(micros);
if (micros > 20000) {
fprintf(stderr, "long op: %.1f micros%30s\r", micros, "");
fflush(stderr);
}
last_op_finish_ = now;
}
done_++;
if (done_ >= next_report_) {
if (next_report_ < 1000) next_report_ += 100;
else if (next_report_ < 5000) next_report_ += 500;
else if (next_report_ < 10000) next_report_ += 1000;
else if (next_report_ < 50000) next_report_ += 5000;
else if (next_report_ < 100000) next_report_ += 10000;
else if (next_report_ < 500000) next_report_ += 50000;
else next_report_ += 100000;
fprintf(stderr, "... finished %d ops%30s\r", done_, "");
fflush(stderr);
}
}
void Stop(const Slice& name) {
double finish = Env::Default()->NowMicros() * 1e-6;
// Pretend at least one op was done in case we are running a benchmark
// that does not call FinishedSingleOp().
if (done_ < 1) done_ = 1;
if (bytes_ > 0) {
char rate[100];
snprintf(rate, sizeof(rate), "%6.1f MB/s",
(bytes_ / 1048576.0) / (finish - start_));
if (!message_.empty()) {
message_ = std::string(rate) + " " + message_;
} else {
message_ = rate;
}
}
fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n",
name.ToString().c_str(),
(finish - start_) * 1e6 / done_,
(message_.empty() ? "" : " "),
message_.c_str());
if (FLAGS_histogram) {
fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
}
fflush(stdout);
}
public:
enum Order {
SEQUENTIAL,
RANDOM
};
enum DBState {
FRESH,
EXISTING
};
Benchmark()
: db_(NULL),
db_num_(0),
num_(FLAGS_num),
reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
bytes_(0),
rand_(301) {
std::vector<std::string> files;
std::string test_dir;
Env::Default()->GetTestDirectory(&test_dir);
Env::Default()->GetChildren(test_dir, &files);
if (!FLAGS_use_existing_db) {
for (int i = 0; i < files.size(); i++) {
if (Slice(files[i]).starts_with("dbbench_sqlite3")) {
std::string file_name(test_dir);
file_name += "/";
file_name += files[i];
Env::Default()->DeleteFile(file_name.c_str());
}
}
}
}
~Benchmark() {
int status = sqlite3_close(db_);
ErrorCheck(status);
}
void Run() {
PrintHeader();
Open();
const char* benchmarks = FLAGS_benchmarks;
while (benchmarks != NULL) {
const char* sep = strchr(benchmarks, ',');
Slice name;
if (sep == NULL) {
name = benchmarks;
benchmarks = NULL;
} else {
name = Slice(benchmarks, sep - benchmarks);
benchmarks = sep + 1;
}
bytes_ = 0;
Start();
bool known = true;
bool write_sync = false;
if (name == Slice("fillseq")) {
Write(write_sync, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1);
WalCheckpoint(db_);
} else if (name == Slice("fillseqbatch")) {
Write(write_sync, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1000);
WalCheckpoint(db_);
} else if (name == Slice("fillrandom")) {
Write(write_sync, RANDOM, FRESH, num_, FLAGS_value_size, 1);
WalCheckpoint(db_);
} else if (name == Slice("fillrandbatch")) {
Write(write_sync, RANDOM, FRESH, num_, FLAGS_value_size, 1000);
WalCheckpoint(db_);
} else if (name == Slice("overwrite")) {
Write(write_sync, RANDOM, EXISTING, num_, FLAGS_value_size, 1);
WalCheckpoint(db_);
} else if (name == Slice("overwritebatch")) {
Write(write_sync, RANDOM, EXISTING, num_, FLAGS_value_size, 1000);
WalCheckpoint(db_);
} else if (name == Slice("fillrandsync")) {
write_sync = true;
Write(write_sync, RANDOM, FRESH, num_ / 100, FLAGS_value_size, 1);
WalCheckpoint(db_);
} else if (name == Slice("fillseqsync")) {
write_sync = true;
Write(write_sync, SEQUENTIAL, FRESH, num_ / 100, FLAGS_value_size, 1);
WalCheckpoint(db_);
} else if (name == Slice("fillrand100K")) {
Write(write_sync, RANDOM, FRESH, num_ / 1000, 100 * 1000, 1);
WalCheckpoint(db_);
} else if (name == Slice("fillseq100K")) {
Write(write_sync, SEQUENTIAL, FRESH, num_ / 1000, 100 * 1000, 1);
WalCheckpoint(db_);
} else if (name == Slice("readseq")) {
ReadSequential();
} else if (name == Slice("readrandom")) {
Read(RANDOM, 1);
} else if (name == Slice("readrand100K")) {
int n = reads_;
reads_ /= 1000;
Read(RANDOM, 1);
reads_ = n;
} else {
known = false;
if (name != Slice()) { // No error message for empty name
fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
}
}
if (known) {
Stop(name);
}
}
}
void Open() {
assert(db_ == NULL);
int status;
char file_name[100];
char* err_msg = NULL;
db_num_++;
// Open database
std::string tmp_dir;
Env::Default()->GetTestDirectory(&tmp_dir);
snprintf(file_name, sizeof(file_name),
"%s/dbbench_sqlite3-%d.db",
tmp_dir.c_str(),
db_num_);
status = sqlite3_open(file_name, &db_);
if (status) {
fprintf(stderr, "open error: %s\n", sqlite3_errmsg(db_));
exit(1);
}
// Change SQLite cache size
char cache_size[100];
snprintf(cache_size, sizeof(cache_size), "PRAGMA cache_size = %d",
FLAGS_num_pages);
status = sqlite3_exec(db_, cache_size, NULL, NULL, &err_msg);
ExecErrorCheck(status, err_msg);
// FLAGS_page_size is defaulted to 1024
if (FLAGS_page_size != 1024) {
char page_size[100];
snprintf(page_size, sizeof(page_size), "PRAGMA page_size = %d",
FLAGS_page_size);
status = sqlite3_exec(db_, page_size, NULL, NULL, &err_msg);
ExecErrorCheck(status, err_msg);
}
// Change journal mode to WAL if WAL enabled flag is on
if (FLAGS_WAL_enabled) {
std::string WAL_stmt = "PRAGMA journal_mode = WAL";
// LevelDB's default cache size is a combined 4 MB
std::string WAL_checkpoint = "PRAGMA wal_autocheckpoint = 4096";
status = sqlite3_exec(db_, WAL_stmt.c_str(), NULL, NULL, &err_msg);
ExecErrorCheck(status, err_msg);
status = sqlite3_exec(db_, WAL_checkpoint.c_str(), NULL, NULL, &err_msg);
ExecErrorCheck(status, err_msg);
}
// Change locking mode to exclusive and create tables/index for database
std::string locking_stmt = "PRAGMA locking_mode = EXCLUSIVE";
std::string create_stmt =
"CREATE TABLE test (key blob, value blob, PRIMARY KEY(key))";
std::string stmt_array[] = { locking_stmt, create_stmt };
int stmt_array_length = sizeof(stmt_array) / sizeof(std::string);
for (int i = 0; i < stmt_array_length; i++) {
status = sqlite3_exec(db_, stmt_array[i].c_str(), NULL, NULL, &err_msg);
ExecErrorCheck(status, err_msg);
}
}
void Write(bool write_sync, Order order, DBState state,
int num_entries, int value_size, int entries_per_batch) {
// Create new database if state == FRESH
if (state == FRESH) {
if (FLAGS_use_existing_db) {
message_ = "skipping (--use_existing_db is true)";
return;
}
sqlite3_close(db_);
db_ = NULL;
Open();
Start();
}
if (num_entries != num_) {
char msg[100];
snprintf(msg, sizeof(msg), "(%d ops)", num_entries);
message_ = msg;
}
char* err_msg = NULL;
int status;
sqlite3_stmt *replace_stmt, *begin_trans_stmt, *end_trans_stmt;
std::string replace_str = "REPLACE INTO test (key, value) VALUES (?, ?)";
std::string begin_trans_str = "BEGIN TRANSACTION;";
std::string end_trans_str = "END TRANSACTION;";
// Check for synchronous flag in options
std::string sync_stmt = (write_sync) ? "PRAGMA synchronous = FULL" :
"PRAGMA synchronous = OFF";
status = sqlite3_exec(db_, sync_stmt.c_str(), NULL, NULL, &err_msg);
ExecErrorCheck(status, err_msg);
// Preparing sqlite3 statements
status = sqlite3_prepare_v2(db_, replace_str.c_str(), -1,
&replace_stmt, NULL);
ErrorCheck(status);
status = sqlite3_prepare_v2(db_, begin_trans_str.c_str(), -1,
&begin_trans_stmt, NULL);
ErrorCheck(status);
status = sqlite3_prepare_v2(db_, end_trans_str.c_str(), -1,
&end_trans_stmt, NULL);
ErrorCheck(status);
bool transaction = (entries_per_batch > 1);
for (int i = 0; i < num_entries; i += entries_per_batch) {
// Begin write transaction
if (FLAGS_transaction && transaction) {
status = sqlite3_step(begin_trans_stmt);
StepErrorCheck(status);
status = sqlite3_reset(begin_trans_stmt);
ErrorCheck(status);
}
// Create and execute SQL statements
for (int j = 0; j < entries_per_batch; j++) {
const char* value = gen_.Generate(value_size).data();
// Create values for key-value pair
const int k = (order == SEQUENTIAL) ? i + j :
(rand_.Next() % num_entries);
char key[100];
snprintf(key, sizeof(key), "%016d", k);
// Bind KV values into replace_stmt
status = sqlite3_bind_blob(replace_stmt, 1, key, 16, SQLITE_STATIC);
ErrorCheck(status);
status = sqlite3_bind_blob(replace_stmt, 2, value,
value_size, SQLITE_STATIC);
ErrorCheck(status);
// Execute replace_stmt
bytes_ += value_size + strlen(key);
status = sqlite3_step(replace_stmt);
StepErrorCheck(status);
// Reset SQLite statement for another use
status = sqlite3_clear_bindings(replace_stmt);
ErrorCheck(status);
status = sqlite3_reset(replace_stmt);
ErrorCheck(status);
FinishedSingleOp();
}
// End write transaction
if (FLAGS_transaction && transaction) {
status = sqlite3_step(end_trans_stmt);
StepErrorCheck(status);
status = sqlite3_reset(end_trans_stmt);
ErrorCheck(status);
}
}
status = sqlite3_finalize(replace_stmt);
ErrorCheck(status);
status = sqlite3_finalize(begin_trans_stmt);
ErrorCheck(status);
status = sqlite3_finalize(end_trans_stmt);
ErrorCheck(status);
}
void Read(Order order, int entries_per_batch) {
int status;
sqlite3_stmt *read_stmt, *begin_trans_stmt, *end_trans_stmt;
std::string read_str = "SELECT * FROM test WHERE key = ?";
std::string begin_trans_str = "BEGIN TRANSACTION;";
std::string end_trans_str = "END TRANSACTION;";
// Preparing sqlite3 statements
status = sqlite3_prepare_v2(db_, begin_trans_str.c_str(), -1,
&begin_trans_stmt, NULL);
ErrorCheck(status);
status = sqlite3_prepare_v2(db_, end_trans_str.c_str(), -1,
&end_trans_stmt, NULL);
ErrorCheck(status);
status = sqlite3_prepare_v2(db_, read_str.c_str(), -1, &read_stmt, NULL);
ErrorCheck(status);
bool transaction = (entries_per_batch > 1);
for (int i = 0; i < reads_; i += entries_per_batch) {
// Begin read transaction
if (FLAGS_transaction && transaction) {
status = sqlite3_step(begin_trans_stmt);
StepErrorCheck(status);
status = sqlite3_reset(begin_trans_stmt);
ErrorCheck(status);
}
// Create and execute SQL statements
for (int j = 0; j < entries_per_batch; j++) {
// Create key value
char key[100];
int k = (order == SEQUENTIAL) ? i + j : (rand_.Next() % reads_);
snprintf(key, sizeof(key), "%016d", k);
// Bind key value into read_stmt
status = sqlite3_bind_blob(read_stmt, 1, key, 16, SQLITE_STATIC);
ErrorCheck(status);
// Execute read statement
while ((status = sqlite3_step(read_stmt)) == SQLITE_ROW) {}
StepErrorCheck(status);
// Reset SQLite statement for another use
status = sqlite3_clear_bindings(read_stmt);
ErrorCheck(status);
status = sqlite3_reset(read_stmt);
ErrorCheck(status);
FinishedSingleOp();
}
// End read transaction
if (FLAGS_transaction && transaction) {
status = sqlite3_step(end_trans_stmt);
StepErrorCheck(status);
status = sqlite3_reset(end_trans_stmt);
ErrorCheck(status);
}
}
status = sqlite3_finalize(read_stmt);
ErrorCheck(status);
status = sqlite3_finalize(begin_trans_stmt);
ErrorCheck(status);
status = sqlite3_finalize(end_trans_stmt);
ErrorCheck(status);
}
void ReadSequential() {
int status;
sqlite3_stmt *pStmt;
std::string read_str = "SELECT * FROM test ORDER BY key";
status = sqlite3_prepare_v2(db_, read_str.c_str(), -1, &pStmt, NULL);
ErrorCheck(status);
for (int i = 0; i < reads_ && SQLITE_ROW == sqlite3_step(pStmt); i++) {
bytes_ += sqlite3_column_bytes(pStmt, 1) + sqlite3_column_bytes(pStmt, 2);
FinishedSingleOp();
}
status = sqlite3_finalize(pStmt);
ErrorCheck(status);
}
};
} // namespace leveldb
int main(int argc, char** argv) {
std::string default_db_path;
for (int i = 1; i < argc; i++) {
double d;
int n;
char junk;
if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) {
FLAGS_benchmarks = argv[i] + strlen("--benchmarks=");
} else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_histogram = n;
} else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) {
FLAGS_compression_ratio = d;
} else if (sscanf(argv[i], "--use_existing_db=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_use_existing_db = n;
} else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
FLAGS_num = n;
} else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) {
FLAGS_reads = n;
} else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) {
FLAGS_value_size = n;
} else if (leveldb::Slice(argv[i]) == leveldb::Slice("--no_transaction")) {
FLAGS_transaction = false;
} else if (sscanf(argv[i], "--page_size=%d%c", &n, &junk) == 1) {
FLAGS_page_size = n;
} else if (sscanf(argv[i], "--num_pages=%d%c", &n, &junk) == 1) {
FLAGS_num_pages = n;
} else if (sscanf(argv[i], "--WAL_enabled=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_WAL_enabled = n;
} else if (strncmp(argv[i], "--db=", 5) == 0) {
FLAGS_db = argv[i] + 5;
} else {
fprintf(stderr, "Invalid flag '%s'\n", argv[i]);
exit(1);
}
}
// Choose a location for the test database if none given with --db=<path>
if (FLAGS_db == NULL) {
leveldb::Env::Default()->GetTestDirectory(&default_db_path);
default_db_path += "/dbbench";
FLAGS_db = default_db_path.c_str();
}
leveldb::Benchmark benchmark;
benchmark.Run();
return 0;
}

View File

@@ -0,0 +1,528 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <stdio.h>
#include <stdlib.h>
#include <kcpolydb.h>
#include "util/histogram.h"
#include "util/random.h"
#include "util/testutil.h"
// Comma-separated list of operations to run in the specified order
// Actual benchmarks:
//
// fillseq -- write N values in sequential key order in async mode
// fillrandom -- write N values in random key order in async mode
// overwrite -- overwrite N values in random key order in async mode
// fillseqsync -- write N/100 values in sequential key order in sync mode
// fillrandsync -- write N/100 values in random key order in sync mode
// fillrand100K -- write N/1000 100K values in random order in async mode
// fillseq100K -- write N/1000 100K values in seq order in async mode
// readseq -- read N times sequentially
// readseq100K -- read N/1000 100K values in sequential order in async mode
// readrand100K -- read N/1000 100K values in sequential order in async mode
// readrandom -- read N times in random order
static const char* FLAGS_benchmarks =
"fillseq,"
"fillseqsync,"
"fillrandsync,"
"fillrandom,"
"overwrite,"
"readrandom,"
"readseq,"
"fillrand100K,"
"fillseq100K,"
"readseq100K,"
"readrand100K,"
;
// Number of key/values to place in database
static int FLAGS_num = 1000000;
// Number of read operations to do. If negative, do FLAGS_num reads.
static int FLAGS_reads = -1;
// Size of each value
static int FLAGS_value_size = 100;
// Arrange to generate values that shrink to this fraction of
// their original size after compression
static double FLAGS_compression_ratio = 0.5;
// Print histogram of operation timings
static bool FLAGS_histogram = false;
// Cache size. Default 4 MB
static int FLAGS_cache_size = 4194304;
// Page size. Default 1 KB
static int FLAGS_page_size = 1024;
// If true, do not destroy the existing database. If you set this
// flag and also specify a benchmark that wants a fresh database, that
// benchmark will fail.
static bool FLAGS_use_existing_db = false;
// Compression flag. If true, compression is on. If false, compression
// is off.
static bool FLAGS_compression = true;
// Use the db with the following name.
static const char* FLAGS_db = NULL;
inline
static void DBSynchronize(kyotocabinet::TreeDB* db_)
{
// Synchronize will flush writes to disk
if (!db_->synchronize()) {
fprintf(stderr, "synchronize error: %s\n", db_->error().name());
}
}
namespace leveldb {
// Helper for quickly generating random data.
namespace {
class RandomGenerator {
private:
std::string data_;
int pos_;
public:
RandomGenerator() {
// We use a limited amount of data over and over again and ensure
// that it is larger than the compression window (32KB), and also
// large enough to serve all typical value sizes we want to write.
Random rnd(301);
std::string piece;
while (data_.size() < 1048576) {
// Add a short fragment that is as compressible as specified
// by FLAGS_compression_ratio.
test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
data_.append(piece);
}
pos_ = 0;
}
Slice Generate(int len) {
if (pos_ + len > data_.size()) {
pos_ = 0;
assert(len < data_.size());
}
pos_ += len;
return Slice(data_.data() + pos_ - len, len);
}
};
static Slice TrimSpace(Slice s) {
int start = 0;
while (start < s.size() && isspace(s[start])) {
start++;
}
int limit = s.size();
while (limit > start && isspace(s[limit-1])) {
limit--;
}
return Slice(s.data() + start, limit - start);
}
} // namespace
class Benchmark {
private:
kyotocabinet::TreeDB* db_;
int db_num_;
int num_;
int reads_;
double start_;
double last_op_finish_;
int64_t bytes_;
std::string message_;
Histogram hist_;
RandomGenerator gen_;
Random rand_;
kyotocabinet::LZOCompressor<kyotocabinet::LZO::RAW> comp_;
// State kept for progress messages
int done_;
int next_report_; // When to report next
void PrintHeader() {
const int kKeySize = 16;
PrintEnvironment();
fprintf(stdout, "Keys: %d bytes each\n", kKeySize);
fprintf(stdout, "Values: %d bytes each (%d bytes after compression)\n",
FLAGS_value_size,
static_cast<int>(FLAGS_value_size * FLAGS_compression_ratio + 0.5));
fprintf(stdout, "Entries: %d\n", num_);
fprintf(stdout, "RawSize: %.1f MB (estimated)\n",
((static_cast<int64_t>(kKeySize + FLAGS_value_size) * num_)
/ 1048576.0));
fprintf(stdout, "FileSize: %.1f MB (estimated)\n",
(((kKeySize + FLAGS_value_size * FLAGS_compression_ratio) * num_)
/ 1048576.0));
PrintWarnings();
fprintf(stdout, "------------------------------------------------\n");
}
void PrintWarnings() {
#if defined(__GNUC__) && !defined(__OPTIMIZE__)
fprintf(stdout,
"WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"
);
#endif
#ifndef NDEBUG
fprintf(stdout,
"WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
#endif
}
void PrintEnvironment() {
fprintf(stderr, "Kyoto Cabinet: version %s, lib ver %d, lib rev %d\n",
kyotocabinet::VERSION, kyotocabinet::LIBVER, kyotocabinet::LIBREV);
#if defined(__linux)
time_t now = time(NULL);
fprintf(stderr, "Date: %s", ctime(&now)); // ctime() adds newline
FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
if (cpuinfo != NULL) {
char line[1000];
int num_cpus = 0;
std::string cpu_type;
std::string cache_size;
while (fgets(line, sizeof(line), cpuinfo) != NULL) {
const char* sep = strchr(line, ':');
if (sep == NULL) {
continue;
}
Slice key = TrimSpace(Slice(line, sep - 1 - line));
Slice val = TrimSpace(Slice(sep + 1));
if (key == "model name") {
++num_cpus;
cpu_type = val.ToString();
} else if (key == "cache size") {
cache_size = val.ToString();
}
}
fclose(cpuinfo);
fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str());
fprintf(stderr, "CPUCache: %s\n", cache_size.c_str());
}
#endif
}
void Start() {
start_ = Env::Default()->NowMicros() * 1e-6;
bytes_ = 0;
message_.clear();
last_op_finish_ = start_;
hist_.Clear();
done_ = 0;
next_report_ = 100;
}
void FinishedSingleOp() {
if (FLAGS_histogram) {
double now = Env::Default()->NowMicros() * 1e-6;
double micros = (now - last_op_finish_) * 1e6;
hist_.Add(micros);
if (micros > 20000) {
fprintf(stderr, "long op: %.1f micros%30s\r", micros, "");
fflush(stderr);
}
last_op_finish_ = now;
}
done_++;
if (done_ >= next_report_) {
if (next_report_ < 1000) next_report_ += 100;
else if (next_report_ < 5000) next_report_ += 500;
else if (next_report_ < 10000) next_report_ += 1000;
else if (next_report_ < 50000) next_report_ += 5000;
else if (next_report_ < 100000) next_report_ += 10000;
else if (next_report_ < 500000) next_report_ += 50000;
else next_report_ += 100000;
fprintf(stderr, "... finished %d ops%30s\r", done_, "");
fflush(stderr);
}
}
void Stop(const Slice& name) {
double finish = Env::Default()->NowMicros() * 1e-6;
// Pretend at least one op was done in case we are running a benchmark
// that does not call FinishedSingleOp().
if (done_ < 1) done_ = 1;
if (bytes_ > 0) {
char rate[100];
snprintf(rate, sizeof(rate), "%6.1f MB/s",
(bytes_ / 1048576.0) / (finish - start_));
if (!message_.empty()) {
message_ = std::string(rate) + " " + message_;
} else {
message_ = rate;
}
}
fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n",
name.ToString().c_str(),
(finish - start_) * 1e6 / done_,
(message_.empty() ? "" : " "),
message_.c_str());
if (FLAGS_histogram) {
fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
}
fflush(stdout);
}
public:
enum Order {
SEQUENTIAL,
RANDOM
};
enum DBState {
FRESH,
EXISTING
};
Benchmark()
: db_(NULL),
num_(FLAGS_num),
reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
bytes_(0),
rand_(301) {
std::vector<std::string> files;
std::string test_dir;
Env::Default()->GetTestDirectory(&test_dir);
Env::Default()->GetChildren(test_dir.c_str(), &files);
if (!FLAGS_use_existing_db) {
for (int i = 0; i < files.size(); i++) {
if (Slice(files[i]).starts_with("dbbench_polyDB")) {
std::string file_name(test_dir);
file_name += "/";
file_name += files[i];
Env::Default()->DeleteFile(file_name.c_str());
}
}
}
}
~Benchmark() {
if (!db_->close()) {
fprintf(stderr, "close error: %s\n", db_->error().name());
}
}
void Run() {
PrintHeader();
Open(false);
const char* benchmarks = FLAGS_benchmarks;
while (benchmarks != NULL) {
const char* sep = strchr(benchmarks, ',');
Slice name;
if (sep == NULL) {
name = benchmarks;
benchmarks = NULL;
} else {
name = Slice(benchmarks, sep - benchmarks);
benchmarks = sep + 1;
}
Start();
bool known = true;
bool write_sync = false;
if (name == Slice("fillseq")) {
Write(write_sync, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1);
} else if (name == Slice("fillrandom")) {
Write(write_sync, RANDOM, FRESH, num_, FLAGS_value_size, 1);
DBSynchronize(db_);
} else if (name == Slice("overwrite")) {
Write(write_sync, RANDOM, EXISTING, num_, FLAGS_value_size, 1);
DBSynchronize(db_);
} else if (name == Slice("fillrandsync")) {
write_sync = true;
Write(write_sync, RANDOM, FRESH, num_ / 100, FLAGS_value_size, 1);
DBSynchronize(db_);
} else if (name == Slice("fillseqsync")) {
write_sync = true;
Write(write_sync, SEQUENTIAL, FRESH, num_ / 100, FLAGS_value_size, 1);
DBSynchronize(db_);
} else if (name == Slice("fillrand100K")) {
Write(write_sync, RANDOM, FRESH, num_ / 1000, 100 * 1000, 1);
DBSynchronize(db_);
} else if (name == Slice("fillseq100K")) {
Write(write_sync, SEQUENTIAL, FRESH, num_ / 1000, 100 * 1000, 1);
DBSynchronize(db_);
} else if (name == Slice("readseq")) {
ReadSequential();
} else if (name == Slice("readrandom")) {
ReadRandom();
} else if (name == Slice("readrand100K")) {
int n = reads_;
reads_ /= 1000;
ReadRandom();
reads_ = n;
} else if (name == Slice("readseq100K")) {
int n = reads_;
reads_ /= 1000;
ReadSequential();
reads_ = n;
} else {
known = false;
if (name != Slice()) { // No error message for empty name
fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
}
}
if (known) {
Stop(name);
}
}
}
private:
void Open(bool sync) {
assert(db_ == NULL);
// Initialize db_
db_ = new kyotocabinet::TreeDB();
char file_name[100];
db_num_++;
std::string test_dir;
Env::Default()->GetTestDirectory(&test_dir);
snprintf(file_name, sizeof(file_name),
"%s/dbbench_polyDB-%d.kct",
test_dir.c_str(),
db_num_);
// Create tuning options and open the database
int open_options = kyotocabinet::PolyDB::OWRITER |
kyotocabinet::PolyDB::OCREATE;
int tune_options = kyotocabinet::TreeDB::TSMALL |
kyotocabinet::TreeDB::TLINEAR;
if (FLAGS_compression) {
tune_options |= kyotocabinet::TreeDB::TCOMPRESS;
db_->tune_compressor(&comp_);
}
db_->tune_options(tune_options);
db_->tune_page_cache(FLAGS_cache_size);
db_->tune_page(FLAGS_page_size);
db_->tune_map(256LL<<20);
if (sync) {
open_options |= kyotocabinet::PolyDB::OAUTOSYNC;
}
if (!db_->open(file_name, open_options)) {
fprintf(stderr, "open error: %s\n", db_->error().name());
}
}
void Write(bool sync, Order order, DBState state,
int num_entries, int value_size, int entries_per_batch) {
// Create new database if state == FRESH
if (state == FRESH) {
if (FLAGS_use_existing_db) {
message_ = "skipping (--use_existing_db is true)";
return;
}
delete db_;
db_ = NULL;
Open(sync);
Start(); // Do not count time taken to destroy/open
}
if (num_entries != num_) {
char msg[100];
snprintf(msg, sizeof(msg), "(%d ops)", num_entries);
message_ = msg;
}
// Write to database
for (int i = 0; i < num_entries; i++)
{
const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % num_entries);
char key[100];
snprintf(key, sizeof(key), "%016d", k);
bytes_ += value_size + strlen(key);
std::string cpp_key = key;
if (!db_->set(cpp_key, gen_.Generate(value_size).ToString())) {
fprintf(stderr, "set error: %s\n", db_->error().name());
}
FinishedSingleOp();
}
}
void ReadSequential() {
kyotocabinet::DB::Cursor* cur = db_->cursor();
cur->jump();
std::string ckey, cvalue;
while (cur->get(&ckey, &cvalue, true)) {
bytes_ += ckey.size() + cvalue.size();
FinishedSingleOp();
}
delete cur;
}
void ReadRandom() {
std::string value;
for (int i = 0; i < reads_; i++) {
char key[100];
const int k = rand_.Next() % reads_;
snprintf(key, sizeof(key), "%016d", k);
db_->get(key, &value);
FinishedSingleOp();
}
}
};
} // namespace leveldb
int main(int argc, char** argv) {
std::string default_db_path;
for (int i = 1; i < argc; i++) {
double d;
int n;
char junk;
if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) {
FLAGS_benchmarks = argv[i] + strlen("--benchmarks=");
} else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) {
FLAGS_compression_ratio = d;
} else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_histogram = n;
} else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
FLAGS_num = n;
} else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) {
FLAGS_reads = n;
} else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) {
FLAGS_value_size = n;
} else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) {
FLAGS_cache_size = n;
} else if (sscanf(argv[i], "--page_size=%d%c", &n, &junk) == 1) {
FLAGS_page_size = n;
} else if (sscanf(argv[i], "--compression=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_compression = (n == 1) ? true : false;
} else if (strncmp(argv[i], "--db=", 5) == 0) {
FLAGS_db = argv[i] + 5;
} else {
fprintf(stderr, "Invalid flag '%s'\n", argv[i]);
exit(1);
}
}
// Choose a location for the test database if none given with --db=<path>
if (FLAGS_db == NULL) {
leveldb::Env::Default()->GetTestDirectory(&default_db_path);
default_db_path += "/dbbench";
FLAGS_db = default_db_path.c_str();
}
leveldb::Benchmark benchmark;
benchmark.Run();
return 0;
}

View File

@@ -0,0 +1,459 @@
<!DOCTYPE html>
<html>
<head>
<title>LevelDB Benchmarks</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<style>
body {
font-family:Helvetica,sans-serif;
padding:20px;
}
h2 {
padding-top:30px;
}
table.bn {
width:800px;
border-collapse:collapse;
border:0;
padding:0;
}
table.bnbase {
width:650px;
}
table.bn td {
padding:2px 0;
}
table.bn td.c1 {
font-weight:bold;
width:150px;
}
table.bn td.c1 div.e {
float:right;
font-weight:normal;
}
table.bn td.c2 {
width:150px;
text-align:right;
padding:2px;
}
table.bn td.c3 {
width:350px;
}
table.bn td.c4 {
width:150px;
font-size:small;
padding-left:4px;
}
/* chart bars */
div.bldb {
background-color:#0255df;
}
div.bkct {
background-color:#df5555;
}
div.bsql {
background-color:#aadf55;
}
.code {
font-family:monospace;
font-size:large;
}
.todo {
color: red;
}
</style>
</head>
<body>
<h1>LevelDB Benchmarks</h1>
<p>Google, July 2011</p>
<hr>
<p>In order to test LevelDB's performance, we benchmark it against other well-established database implementations. We compare LevelDB (revision 39) against <a href="http://www.sqlite.org/">SQLite3</a> (version 3.7.6.3) and <a href="http://fallabs.com/kyotocabinet/spex.html">Kyoto Cabinet's</a> (version 1.2.67) TreeDB (a B+Tree based key-value store). We would like to acknowledge Scott Hess and Mikio Hirabayashi for their suggestions and contributions to the SQLite3 and Kyoto Cabinet benchmarks, respectively.</p>
<p>Benchmarks were all performed on a six-core Intel(R) Xeon(R) CPU X5650 @ 2.67GHz, with 12288 KB of total L3 cache and 12 GB of DDR3 RAM at 1333 MHz. (Note that LevelDB uses at most two CPUs since the benchmarks are single threaded: one to run the benchmark, and one for background compactions.) We ran the benchmarks on two machines (with identical processors), one with an Ext3 file system and one with an Ext4 file system. The machine with the Ext3 file system has a SATA Hitachi HDS721050CLA362 hard drive. The machine with the Ext4 file system has a SATA Samsung HD502HJ hard drive. Both hard drives spin at 7200 RPM and have hard drive write-caching enabled (using `hdparm -W 1 [device]`). The numbers reported below are the median of three measurements.</p>
<h4>Benchmark Source Code</h4>
<p>We wrote benchmark tools for SQLite and Kyoto TreeDB based on LevelDB's <span class="code">db_bench</span>. The code for each of the benchmarks resides here:</p>
<ul>
<li> <b>LevelDB:</b> <a href="http://code.google.com/p/leveldb/source/browse/trunk/db/db_bench.cc">db/db_bench.cc</a>.</li>
<li> <b>SQLite:</b> <a href="http://code.google.com/p/leveldb/source/browse/#svn%2Ftrunk%2Fdoc%2Fbench%2Fdb_bench_sqlite3.cc">doc/bench/db_bench_sqlite3.cc</a>.</li>
<li> <b>Kyoto TreeDB:</b> <a href="http://code.google.com/p/leveldb/source/browse/#svn%2Ftrunk%2Fdoc%2Fbench%2Fdb_bench_tree_db.cc">doc/bench/db_bench_tree_db.cc</a>.</li>
</ul>
<h4>Custom Build Specifications</h4>
<ul>
<li>LevelDB: LevelDB was compiled with the <a href="http://code.google.com/p/google-perftools">tcmalloc</a> library and the <a href="http://code.google.com/p/snappy/">Snappy</a> compression library (revision 33). Assertions were disabled.</li>
<li>TreeDB: TreeDB was compiled using the <a href="http://www.oberhumer.com/opensource/lzo/">LZO</a> compression library (version 2.03). Furthermore, we enabled the TSMALL and TLINEAR options when opening the database in order to reduce the footprint of each record.</li>
<li>SQLite: We tuned SQLite's performance, by setting its locking mode to exclusive. We also enabled SQLite's <a href="http://www.sqlite.org/draft/wal.html">write-ahead logging</a>.</li>
</ul>
<h2>1. Baseline Performance</h2>
<p>This section gives the baseline performance of all the
databases. Following sections show how performance changes as various
parameters are varied. For the baseline:</p>
<ul>
<li> Each database is allowed 4 MB of cache memory.</li>
<li> Databases are opened in <em>asynchronous</em> write mode.
(LevelDB's sync option, TreeDB's OAUTOSYNC option, and
SQLite3's synchronous options are all turned off). I.e.,
every write is pushed to the operating system, but the
benchmark does not wait for the write to reach the disk.</li>
<li> Keys are 16 bytes each.</li>
<li> Value are 100 bytes each (with enough redundancy so that
a simple compressor shrinks them to 50% of their original
size).</li>
<li> Sequential reads/writes traverse the key space in increasing order.</li>
<li> Random reads/writes traverse the key space in random order.</li>
</ul>
<h3>A. Sequential Reads</h3>
<table class="bn bnbase">
<tr><td class="c1">LevelDB</td>
<td class="c2">4,030,000 ops/sec</td>
<td class="c3"><div class="bldb" style="width:350px">&nbsp;</div></td>
<tr><td class="c1">Kyoto TreeDB</td>
<td class="c2">1,010,000 ops/sec</td>
<td class="c3"><div class="bkct" style="width:95px">&nbsp;</div></td>
<tr><td class="c1">SQLite3</td>
<td class="c2">383,000 ops/sec</td>
<td class="c3"><div class="bsql" style="width:33px">&nbsp;</div></td>
</table>
<h3>B. Random Reads</h3>
<table class="bn bnbase">
<tr><td class="c1">LevelDB</td>
<td class="c2">129,000 ops/sec</td>
<td class="c3"><div class="bldb" style="width:298px">&nbsp;</div></td>
<tr><td class="c1">Kyoto TreeDB</td>
<td class="c2">151,000 ops/sec</td>
<td class="c3"><div class="bkct" style="width:350px">&nbsp;</div></td>
<tr><td class="c1">SQLite3</td>
<td class="c2">134,000 ops/sec</td>
<td class="c3"><div class="bsql" style="width:310px">&nbsp;</div></td>
</table>
<h3>C. Sequential Writes</h3>
<table class="bn bnbase">
<tr><td class="c1">LevelDB</td>
<td class="c2">779,000 ops/sec</td>
<td class="c3"><div class="bldb" style="width:350px">&nbsp;</div></td>
<tr><td class="c1">Kyoto TreeDB</td>
<td class="c2">342,000 ops/sec</td>
<td class="c3"><div class="bkct" style="width:154px">&nbsp;</div></td>
<tr><td class="c1">SQLite3</td>
<td class="c2">48,600 ops/sec</td>
<td class="c3"><div class="bsql" style="width:22px">&nbsp;</div></td>
</table>
<h3>D. Random Writes</h3>
<table class="bn bnbase">
<tr><td class="c1">LevelDB</td>
<td class="c2">164,000 ops/sec</td>
<td class="c3"><div class="bldb" style="width:350px">&nbsp;</div></td>
<tr><td class="c1">Kyoto TreeDB</td>
<td class="c2">88,500 ops/sec</td>
<td class="c3"><div class="bkct" style="width:188px">&nbsp;</div></td>
<tr><td class="c1">SQLite3</td>
<td class="c2">9,860 ops/sec</td>
<td class="c3"><div class="bsql" style="width:21px">&nbsp;</div></td>
</table>
<p>LevelDB outperforms both SQLite3 and TreeDB in sequential and random write operations and sequential read operations. Kyoto Cabinet has the fastest random read operations.</p>
<h2>2. Write Performance under Different Configurations</h2>
<h3>A. Large Values </h3>
<p>For this benchmark, we start with an empty database, and write 100,000 byte values (~50% compressible). To keep the benchmark running time reasonable, we stop after writing 1000 values.</p>
<h4>Sequential Writes</h4>
<table class="bn bnbase">
<tr><td class="c1">LevelDB</td>
<td class="c2">1,100 ops/sec</td>
<td class="c3"><div class="bldb" style="width:234px">&nbsp;</div></td></tr>
<tr><td class="c1">Kyoto TreeDB</td>
<td class="c2">1,000 ops/sec</td>
<td class="c3"><div class="bkct" style="width:224px">&nbsp;</div></td></tr>
<tr><td class="c1">SQLite3</td>
<td class="c2">1,600 ops/sec</td>
<td class="c3"><div class="bsql" style="width:350px">&nbsp;</div></td></tr>
</table>
<h4>Random Writes</h4>
<table class="bn bnbase">
<tr><td class="c1">LevelDB</td>
<td class="c2">480 ops/sec</td>
<td class="c3"><div class="bldb" style="width:105px">&nbsp;</div></td></tr>
<tr><td class="c1">Kyoto TreeDB</td>
<td class="c2">1,100 ops/sec</td>
<td class="c3"><div class="bkct" style="width:240px">&nbsp;</div></td></tr>
<tr><td class="c1">SQLite3</td>
<td class="c2">1,600 ops/sec</td>
<td class="c3"><div class="bsql" style="width:350px">&nbsp;</div></td></tr>
</table>
<p>LevelDB doesn't perform as well with large values of 100,000 bytes each. This is because LevelDB writes keys and values at least twice: first time to the transaction log, and second time (during a compaction) to a sorted file.
With larger values, LevelDB's per-operation efficiency is swamped by the
cost of extra copies of large values.</p>
<h3>B. Batch Writes</h3>
<p>A batch write is a set of writes that are applied atomically to the underlying database. A single batch of N writes may be significantly faster than N individual writes. The following benchmark writes one thousand batches where each batch contains one thousand 100-byte values. TreeDB does not support batch writes and is omitted from this benchmark.</p>
<h4>Sequential Writes</h4>
<table class="bn">
<tr><td class="c1">LevelDB</td>
<td class="c2">840,000 entries/sec</td>
<td class="c3"><div class="bldb" style="width:350px">&nbsp;</div></td>
<td class="c4">(1.08x baseline)</td></tr>
<tr><td class="c1">SQLite3</td>
<td class="c2">124,000 entries/sec</td>
<td class="c3"><div class="bsql" style="width:52px">&nbsp;</div></td>
<td class="c4">(2.55x baseline)</td></tr>
</table>
<h4>Random Writes</h4>
<table class="bn">
<tr><td class="c1">LevelDB</td>
<td class="c2">221,000 entries/sec</td>
<td class="c3"><div class="bldb" style="width:350px">&nbsp;</div></td>
<td class="c4">(1.35x baseline)</td></tr>
<tr><td class="c1">SQLite3</td>
<td class="c2">22,000 entries/sec</td>
<td class="c3"><div class="bsql" style="width:34px">&nbsp;</div></td>
<td class="c4">(2.23x baseline)</td></tr>
</table>
<p>Because of the way LevelDB persistent storage is organized, batches of
random writes are not much slower (only a factor of 4x) than batches
of sequential writes.</p>
<h3>C. Synchronous Writes</h3>
<p>In the following benchmark, we enable the synchronous writing modes
of all of the databases. Since this change significantly slows down the
benchmark, we stop after 10,000 writes. For synchronous write tests, we've
disabled hard drive write-caching (using `hdparm -W 0 [device]`).</p>
<ul>
<li>For LevelDB, we set WriteOptions.sync = true.</li>
<li>In TreeDB, we enabled TreeDB's OAUTOSYNC option.</li>
<li>For SQLite3, we set "PRAGMA synchronous = FULL".</li>
</ul>
<h4>Sequential Writes</h4>
<table class="bn">
<tr><td class="c1">LevelDB</td>
<td class="c2">100 ops/sec</td>
<td class="c3"><div class="bldb" style="width:350px">&nbsp;</div></td>
<td class="c4">(0.003x baseline)</td></tr>
<tr><td class="c1">Kyoto TreeDB</td>
<td class="c2">7 ops/sec</td>
<td class="c3"><div class="bkct" style="width:27px">&nbsp;</div></td>
<td class="c4">(0.0004x baseline)</td></tr>
<tr><td class="c1">SQLite3</td>
<td class="c2">88 ops/sec</td>
<td class="c3"><div class="bsql" style="width:315px">&nbsp;</div></td>
<td class="c4">(0.002x baseline)</td></tr>
</table>
<h4>Random Writes</h4>
<table class="bn">
<tr><td class="c1">LevelDB</td>
<td class="c2">100 ops/sec</td>
<td class="c3"><div class="bldb" style="width:350px">&nbsp;</div></td>
<td class="c4">(0.015x baseline)</td></tr>
<tr><td class="c1">Kyoto TreeDB</td>
<td class="c2">8 ops/sec</td>
<td class="c3"><div class="bkct" style="width:29px">&nbsp;</div></td>
<td class="c4">(0.001x baseline)</td></tr>
<tr><td class="c1">SQLite3</td>
<td class="c2">88 ops/sec</td>
<td class="c3"><div class="bsql" style="width:314px">&nbsp;</div></td>
<td class="c4">(0.009x baseline)</td></tr>
</table>
<p>Also see the <code>ext4</code> performance numbers below
since synchronous writes behave significantly differently
on <code>ext3</code> and <code>ext4</code>.</p>
<h3>D. Turning Compression Off</h3>
<p>In the baseline measurements, LevelDB and TreeDB were using
light-weight compression
(<a href="http://code.google.com/p/snappy/">Snappy</a> for LevelDB,
and <a href="http://www.oberhumer.com/opensource/lzo/">LZO</a> for
TreeDB). SQLite3, by default does not use compression. The
experiments below show what happens when compression is disabled in
all of the databases (the SQLite3 numbers are just a copy of
its baseline measurements):</p>
<h4>Sequential Writes</h4>
<table class="bn">
<tr><td class="c1">LevelDB</td>
<td class="c2">594,000 ops/sec</td>
<td class="c3"><div class="bldb" style="width:350px">&nbsp;</div></td>
<td class="c4">(0.76x baseline)</td></tr>
<tr><td class="c1">Kyoto TreeDB</td>
<td class="c2">485,000 ops/sec</td>
<td class="c3"><div class="bkct" style="width:239px">&nbsp;</div></td>
<td class="c4">(1.42x baseline)</td></tr>
<tr><td class="c1">SQLite3</td>
<td class="c2">48,600 ops/sec</td>
<td class="c3"><div class="bsql" style="width:29px">&nbsp;</div></td>
<td class="c4">(1.00x baseline)</td></tr>
</table>
<h4>Random Writes</h4>
<table class="bn">
<tr><td class="c1">LevelDB</td>
<td class="c2">135,000 ops/sec</td>
<td class="c3"><div class="bldb" style="width:296px">&nbsp;</div></td>
<td class="c4">(0.82x baseline)</td></tr>
<tr><td class="c1">Kyoto TreeDB</td>
<td class="c2">159,000 ops/sec</td>
<td class="c3"><div class="bkct" style="width:350px">&nbsp;</div></td>
<td class="c4">(1.80x baseline)</td></tr>
<tr><td class="c1">SQLite3</td>
<td class="c2">9,860 ops/sec</td>
<td class="c3"><div class="bsql" style="width:22px">&nbsp;</div></td>
<td class="c4">(1.00x baseline)</td></tr>
</table>
<p>LevelDB's write performance is better with compression than without
since compression decreases the amount of data that has to be written
to disk. Therefore LevelDB users can leave compression enabled in
most scenarios without having worry about a tradeoff between space
usage and performance. TreeDB's performance on the other hand is
better without compression than with compression. Presumably this is
because TreeDB's compression library (LZO) is more expensive than
LevelDB's compression library (Snappy).<p>
<h3>E. Using More Memory</h3>
<p>We increased the overall cache size for each database to 128 MB. For LevelDB, we partitioned 128 MB into a 120 MB write buffer and 8 MB of cache (up from 2 MB of write buffer and 2 MB of cache). For SQLite3, we kept the page size at 1024 bytes, but increased the number of pages to 131,072 (up from 4096). For TreeDB, we also kept the page size at 1024 bytes, but increased the cache size to 128 MB (up from 4 MB).</p>
<h4>Sequential Writes</h4>
<table class="bn">
<tr><td class="c1">LevelDB</td>
<td class="c2">812,000 ops/sec</td>
<td class="c3"><div class="bldb" style="width:350px">&nbsp;</div></td>
<td class="c4">(1.04x baseline)</td></tr>
<tr><td class="c1">Kyoto TreeDB</td>
<td class="c2">321,000 ops/sec</td>
<td class="c3"><div class="bkct" style="width:138px">&nbsp;</div></td>
<td class="c4">(0.94x baseline)</td></tr>
<tr><td class="c1">SQLite3</td>
<td class="c2">48,500 ops/sec</td>
<td class="c3"><div class="bsql" style="width:21px">&nbsp;</div></td>
<td class="c4">(1.00x baseline)</td></tr>
</table>
<h4>Random Writes</h4>
<table class="bn">
<tr><td class="c1">LevelDB</td>
<td class="c2">355,000 ops/sec</td>
<td class="c3"><div class="bldb" style="width:350px">&nbsp;</div></td>
<td class="c4">(2.16x baseline)</td></tr>
<tr><td class="c1">Kyoto TreeDB</td>
<td class="c2">284,000 ops/sec</td>
<td class="c3"><div class="bkct" style="width:280px">&nbsp;</div></td>
<td class="c4">(3.21x baseline)</td></tr>
<tr><td class="c1">SQLite3</td>
<td class="c2">9,670 ops/sec</td>
<td class="c3"><div class="bsql" style="width:10px">&nbsp;</div></td>
<td class="c4">(0.98x baseline)</td></tr>
</table>
<p>SQLite's performance does not change substantially when compared to
the baseline, but the random write performance for both LevelDB and
TreeDB increases significantly. LevelDB's performance improves
because a larger write buffer reduces the need to merge sorted files
(since it creates a smaller number of larger sorted files). TreeDB's
performance goes up because the entire database is available in memory
for fast in-place updates.</p>
<h2>3. Read Performance under Different Configurations</h2>
<h3>A. Larger Caches</h3>
<p>We increased the overall memory usage to 128 MB for each database.
For LevelDB, we allocated 8 MB to LevelDB's write buffer and 120 MB
to LevelDB's cache. The other databases don't differentiate between a
write buffer and a cache, so we simply set their cache size to 128
MB.</p>
<h4>Sequential Reads</h4>
<table class="bn">
<tr><td class="c1">LevelDB</td>
<td class="c2">5,210,000 ops/sec</td>
<td class="c3"><div class="bldb" style="width:350px">&nbsp;</div></td>
<td class="c4">(1.29x baseline)</td></tr>
<tr><td class="c1">Kyoto TreeDB</td>
<td class="c2">1,070,000 ops/sec</td>
<td class="c3"><div class="bkct" style="width:72px">&nbsp;</div></td>
<td class="c4">(1.06x baseline)</td></tr>
<tr><td class="c1">SQLite3</td>
<td class="c2">609,000 ops/sec</td>
<td class="c3"><div class="bsql" style="width:41px">&nbsp;</div></td>
<td class="c4">(1.59x baseline)</td></tr>
</table>
<h4>Random Reads</h4>
<table class="bn">
<tr><td class="c1">LevelDB</td>
<td class="c2">190,000 ops/sec</td>
<td class="c3"><div class="bldb" style="width:144px">&nbsp;</div></td>
<td class="c4">(1.47x baseline)</td></tr>
<tr><td class="c1">Kyoto TreeDB</td>
<td class="c2">463,000 ops/sec</td>
<td class="c3"><div class="bkct" style="width:350px">&nbsp;</div></td>
<td class="c4">(3.07x baseline)</td></tr>
<tr><td class="c1">SQLite3</td>
<td class="c2">186,000 ops/sec</td>
<td class="c3"><div class="bsql" style="width:141px">&nbsp;</div></td>
<td class="c4">(1.39x baseline)</td></tr>
</table>
<p>As expected, the read performance of all of the databases increases
when the caches are enlarged. In particular, TreeDB seems to make
very effective use of a cache that is large enough to hold the entire
database.</p>
<h3>B. No Compression Reads </h3>
<p>For this benchmark, we populated a database with 1 million entries consisting of 16 byte keys and 100 byte values. We compiled LevelDB and Kyoto Cabinet without compression support, so results that are read out from the database are already uncompressed. We've listed the SQLite3 baseline read performance as a point of comparison.</p>
<h4>Sequential Reads</h4>
<table class="bn">
<tr><td class="c1">LevelDB</td>
<td class="c2">4,880,000 ops/sec</td>
<td class="c3"><div class="bldb" style="width:350px">&nbsp;</div></td>
<td class="c4">(1.21x baseline)</td></tr>
<tr><td class="c1">Kyoto TreeDB</td>
<td class="c2">1,230,000 ops/sec</td>
<td class="c3"><div class="bkct" style="width:88px">&nbsp;</div></td>
<td class="c4">(3.60x baseline)</td></tr>
<tr><td class="c1">SQLite3</td>
<td class="c2">383,000 ops/sec</td>
<td class="c3"><div class="bsql" style="width:27px">&nbsp;</div></td>
<td class="c4">(1.00x baseline)</td></tr>
</table>
<h4>Random Reads</h4>
<table class="bn">
<tr><td class="c1">LevelDB</td>
<td class="c2">149,000 ops/sec</td>
<td class="c3"><div class="bldb" style="width:300px">&nbsp;</div></td>
<td class="c4">(1.16x baseline)</td></tr>
<tr><td class="c1">Kyoto TreeDB</td>
<td class="c2">175,000 ops/sec</td>
<td class="c3"><div class="bkct" style="width:350px">&nbsp;</div></td>
<td class="c4">(1.16x baseline)</td></tr>
<tr><td class="c1">SQLite3</td>
<td class="c2">134,000 ops/sec</td>
<td class="c3"><div class="bsql" style="width:268px">&nbsp;</div></td>
<td class="c4">(1.00x baseline)</td></tr>
</table>
<p>Performance of both LevelDB and TreeDB improves a small amount when
compression is disabled. Note however that under different workloads,
performance may very well be better with compression if it allows more
of the working set to fit in memory.</p>
<h2>Note about Ext4 Filesystems</h2>
<p>The preceding numbers are for an ext3 file system. Synchronous writes are much slower under <a href="http://en.wikipedia.org/wiki/Ext4">ext4</a> (LevelDB drops to ~31 writes / second and TreeDB drops to ~5 writes / second; SQLite3's synchronous writes do not noticeably drop) due to ext4's different handling of <span class="code">fsync</span> / <span class="code">msync</span> calls. Even LevelDB's asynchronous write performance drops somewhat since it spreads its storage across multiple files and issues <span class="code">fsync</span> calls when switching to a new file.</p>
<h2>Acknowledgements</h2>
<p>Jeff Dean and Sanjay Ghemawat wrote LevelDB. Kevin Tseng wrote and compiled these benchmarks. Mikio Hirabayashi, Scott Hess, and Gabor Cselle provided help and advice.</p>
</body>
</html>

89
src/leveldb/doc/doc.css Normal file
View File

@@ -0,0 +1,89 @@
body {
margin-left: 0.5in;
margin-right: 0.5in;
background: white;
color: black;
}
h1 {
margin-left: -0.2in;
font-size: 14pt;
}
h2 {
margin-left: -0in;
font-size: 12pt;
}
h3 {
margin-left: -0in;
}
h4 {
margin-left: -0in;
}
hr {
margin-left: -0in;
}
/* Definition lists: definition term bold */
dt {
font-weight: bold;
}
address {
text-align: center;
}
code,samp,var {
color: blue;
}
kbd {
color: #600000;
}
div.note p {
float: right;
width: 3in;
margin-right: 0%;
padding: 1px;
border: 2px solid #6060a0;
background-color: #fffff0;
}
ul {
margin-top: -0em;
margin-bottom: -0em;
}
ol {
margin-top: -0em;
margin-bottom: -0em;
}
UL.nobullets {
list-style-type: none;
list-style-image: none;
margin-left: -1em;
}
p {
margin: 1em 0 1em 0;
padding: 0 0 0 0;
}
pre {
line-height: 1.3em;
padding: 0.4em 0 0.8em 0;
margin: 0 0 0 0;
border: 0 0 0 0;
color: blue;
}
.datatable {
margin-left: auto;
margin-right: auto;
margin-top: 2em;
margin-bottom: 2em;
border: 1px solid;
}
.datatable td,th {
padding: 0 0.5em 0 0.5em;
text-align: right;
}

213
src/leveldb/doc/impl.html Normal file
View File

@@ -0,0 +1,213 @@
<!DOCTYPE html>
<html>
<head>
<link rel="stylesheet" type="text/css" href="doc.css" />
<title>Leveldb file layout and compactions</title>
</head>
<body>
<h1>Files</h1>
The implementation of leveldb is similar in spirit to the
representation of a single
<a href="http://labs.google.com/papers/bigtable.html">
Bigtable tablet (section 5.3)</a>.
However the organization of the files that make up the representation
is somewhat different and is explained below.
<p>
Each database is represented by a set of files stored in a directory.
There are several different types of files as documented below:
<p>
<h2>Log files</h2>
<p>
A log file (*.log) stores a sequence of recent updates. Each update
is appended to the current log file. When the log file reaches a
pre-determined size (approximately 4MB by default), it is converted
to a sorted table (see below) and a new log file is created for future
updates.
<p>
A copy of the current log file is kept in an in-memory structure (the
<code>memtable</code>). This copy is consulted on every read so that read
operations reflect all logged updates.
<p>
<h2>Sorted tables</h2>
<p>
A sorted table (*.sst) stores a sequence of entries sorted by key.
Each entry is either a value for the key, or a deletion marker for the
key. (Deletion markers are kept around to hide obsolete values
present in older sorted tables).
<p>
The set of sorted tables are organized into a sequence of levels. The
sorted table generated from a log file is placed in a special <code>young</code>
level (also called level-0). When the number of young files exceeds a
certain threshold (currently four), all of the young files are merged
together with all of the overlapping level-1 files to produce a
sequence of new level-1 files (we create a new level-1 file for every
2MB of data.)
<p>
Files in the young level may contain overlapping keys. However files
in other levels have distinct non-overlapping key ranges. Consider
level number L where L >= 1. When the combined size of files in
level-L exceeds (10^L) MB (i.e., 10MB for level-1, 100MB for level-2,
...), one file in level-L, and all of the overlapping files in
level-(L+1) are merged to form a set of new files for level-(L+1).
These merges have the effect of gradually migrating new updates from
the young level to the largest level using only bulk reads and writes
(i.e., minimizing expensive seeks).
<h2>Manifest</h2>
<p>
A MANIFEST file lists the set of sorted tables that make up each
level, the corresponding key ranges, and other important metadata.
A new MANIFEST file (with a new number embedded in the file name)
is created whenever the database is reopened. The MANIFEST file is
formatted as a log, and changes made to the serving state (as files
are added or removed) are appended to this log.
<p>
<h2>Current</h2>
<p>
CURRENT is a simple text file that contains the name of the latest
MANIFEST file.
<p>
<h2>Info logs</h2>
<p>
Informational messages are printed to files named LOG and LOG.old.
<p>
<h2>Others</h2>
<p>
Other files used for miscellaneous purposes may also be present
(LOCK, *.dbtmp).
<h1>Level 0</h1>
When the log file grows above a certain size (1MB by default):
<ul>
<li>Create a brand new memtable and log file and direct future updates here
<li>In the background:
<ul>
<li>Write the contents of the previous memtable to an sstable
<li>Discard the memtable
<li>Delete the old log file and the old memtable
<li>Add the new sstable to the young (level-0) level.
</ul>
</ul>
<h1>Compactions</h1>
<p>
When the size of level L exceeds its limit, we compact it in a
background thread. The compaction picks a file from level L and all
overlapping files from the next level L+1. Note that if a level-L
file overlaps only part of a level-(L+1) file, the entire file at
level-(L+1) is used as an input to the compaction and will be
discarded after the compaction. Aside: because level-0 is special
(files in it may overlap each other), we treat compactions from
level-0 to level-1 specially: a level-0 compaction may pick more than
one level-0 file in case some of these files overlap each other.
<p>
A compaction merges the contents of the picked files to produce a
sequence of level-(L+1) files. We switch to producing a new
level-(L+1) file after the current output file has reached the target
file size (2MB). We also switch to a new output file when the key
range of the current output file has grown enough to overlap more then
ten level-(L+2) files. This last rule ensures that a later compaction
of a level-(L+1) file will not pick up too much data from level-(L+2).
<p>
The old files are discarded and the new files are added to the serving
state.
<p>
Compactions for a particular level rotate through the key space. In
more detail, for each level L, we remember the ending key of the last
compaction at level L. The next compaction for level L will pick the
first file that starts after this key (wrapping around to the
beginning of the key space if there is no such file).
<p>
Compactions drop overwritten values. They also drop deletion markers
if there are no higher numbered levels that contain a file whose range
overlaps the current key.
<h2>Timing</h2>
Level-0 compactions will read up to four 1MB files from level-0, and
at worst all the level-1 files (10MB). I.e., we will read 14MB and
write 14MB.
<p>
Other than the special level-0 compactions, we will pick one 2MB file
from level L. In the worst case, this will overlap ~ 12 files from
level L+1 (10 because level-(L+1) is ten times the size of level-L,
and another two at the boundaries since the file ranges at level-L
will usually not be aligned with the file ranges at level-L+1). The
compaction will therefore read 26MB and write 26MB. Assuming a disk
IO rate of 100MB/s (ballpark range for modern drives), the worst
compaction cost will be approximately 0.5 second.
<p>
If we throttle the background writing to something small, say 10% of
the full 100MB/s speed, a compaction may take up to 5 seconds. If the
user is writing at 10MB/s, we might build up lots of level-0 files
(~50 to hold the 5*10MB). This may signficantly increase the cost of
reads due to the overhead of merging more files together on every
read.
<p>
Solution 1: To reduce this problem, we might want to increase the log
switching threshold when the number of level-0 files is large. Though
the downside is that the larger this threshold, the more memory we will
need to hold the corresponding memtable.
<p>
Solution 2: We might want to decrease write rate artificially when the
number of level-0 files goes up.
<p>
Solution 3: We work on reducing the cost of very wide merges.
Perhaps most of the level-0 files will have their blocks sitting
uncompressed in the cache and we will only need to worry about the
O(N) complexity in the merging iterator.
<h2>Number of files</h2>
Instead of always making 2MB files, we could make larger files for
larger levels to reduce the total file count, though at the expense of
more bursty compactions. Alternatively, we could shard the set of
files into multiple directories.
<p>
An experiment on an <code>ext3</code> filesystem on Feb 04, 2011 shows
the following timings to do 100K file opens in directories with
varying number of files:
<table class="datatable">
<tr><th>Files in directory</th><th>Microseconds to open a file</th></tr>
<tr><td>1000</td><td>9</td>
<tr><td>10000</td><td>10</td>
<tr><td>100000</td><td>16</td>
</table>
So maybe even the sharding is not necessary on modern filesystems?
<h1>Recovery</h1>
<ul>
<li> Read CURRENT to find name of the latest committed MANIFEST
<li> Read the named MANIFEST file
<li> Clean up stale files
<li> We could open all sstables here, but it is probably better to be lazy...
<li> Convert log chunk to a new level-0 sstable
<li> Start directing new writes to a new log file with recovered sequence#
</ul>
<h1>Garbage collection of files</h1>
<code>DeleteObsoleteFiles()</code> is called at the end of every
compaction and at the end of recovery. It finds the names of all
files in the database. It deletes all log files that are not the
current log file. It deletes all table files that are not referenced
from some level and are not the output of an active compaction.
</body>
</html>

549
src/leveldb/doc/index.html Normal file
View File

@@ -0,0 +1,549 @@
<!DOCTYPE html>
<html>
<head>
<link rel="stylesheet" type="text/css" href="doc.css" />
<title>Leveldb</title>
</head>
<body>
<h1>Leveldb</h1>
<address>Jeff Dean, Sanjay Ghemawat</address>
<p>
The <code>leveldb</code> library provides a persistent key value store. Keys and
values are arbitrary byte arrays. The keys are ordered within the key
value store according to a user-specified comparator function.
<p>
<h1>Opening A Database</h1>
<p>
A <code>leveldb</code> database has a name which corresponds to a file system
directory. All of the contents of database are stored in this
directory. The following example shows how to open a database,
creating it if necessary:
<p>
<pre>
#include &lt;assert&gt;
#include "leveldb/db.h"
leveldb::DB* db;
leveldb::Options options;
options.create_if_missing = true;
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &amp;db);
assert(status.ok());
...
</pre>
If you want to raise an error if the database already exists, add
the following line before the <code>leveldb::DB::Open</code> call:
<pre>
options.error_if_exists = true;
</pre>
<h1>Status</h1>
<p>
You may have noticed the <code>leveldb::Status</code> type above. Values of this
type are returned by most functions in <code>leveldb</code> that may encounter an
error. You can check if such a result is ok, and also print an
associated error message:
<p>
<pre>
leveldb::Status s = ...;
if (!s.ok()) cerr &lt;&lt; s.ToString() &lt;&lt; endl;
</pre>
<h1>Closing A Database</h1>
<p>
When you are done with a database, just delete the database object.
Example:
<p>
<pre>
... open the db as described above ...
... do something with db ...
delete db;
</pre>
<h1>Reads And Writes</h1>
<p>
The database provides <code>Put</code>, <code>Delete</code>, and <code>Get</code> methods to
modify/query the database. For example, the following code
moves the value stored under key1 to key2.
<pre>
std::string value;
leveldb::Status s = db-&gt;Get(leveldb::ReadOptions(), key1, &amp;value);
if (s.ok()) s = db-&gt;Put(leveldb::WriteOptions(), key2, value);
if (s.ok()) s = db-&gt;Delete(leveldb::WriteOptions(), key1);
</pre>
<h1>Atomic Updates</h1>
<p>
Note that if the process dies after the Put of key2 but before the
delete of key1, the same value may be left stored under multiple keys.
Such problems can be avoided by using the <code>WriteBatch</code> class to
atomically apply a set of updates:
<p>
<pre>
#include "leveldb/write_batch.h"
...
std::string value;
leveldb::Status s = db-&gt;Get(leveldb::ReadOptions(), key1, &amp;value);
if (s.ok()) {
leveldb::WriteBatch batch;
batch.Delete(key1);
batch.Put(key2, value);
s = db-&gt;Write(leveldb::WriteOptions(), &amp;batch);
}
</pre>
The <code>WriteBatch</code> holds a sequence of edits to be made to the database,
and these edits within the batch are applied in order. Note that we
called <code>Delete</code> before <code>Put</code> so that if <code>key1</code> is identical to <code>key2</code>,
we do not end up erroneously dropping the value entirely.
<p>
Apart from its atomicity benefits, <code>WriteBatch</code> may also be used to
speed up bulk updates by placing lots of individual mutations into the
same batch.
<h1>Synchronous Writes</h1>
By default, each write to <code>leveldb</code> is asynchronous: it
returns after pushing the write from the process into the operating
system. The transfer from operating system memory to the underlying
persistent storage happens asynchronously. The <code>sync</code> flag
can be turned on for a particular write to make the write operation
not return until the data being written has been pushed all the way to
persistent storage. (On Posix systems, this is implemented by calling
either <code>fsync(...)</code> or <code>fdatasync(...)</code> or
<code>msync(..., MS_SYNC)</code> before the write operation returns.)
<pre>
leveldb::WriteOptions write_options;
write_options.sync = true;
db-&gt;Put(write_options, ...);
</pre>
Asynchronous writes are often more than a thousand times as fast as
synchronous writes. The downside of asynchronous writes is that a
crash of the machine may cause the last few updates to be lost. Note
that a crash of just the writing process (i.e., not a reboot) will not
cause any loss since even when <code>sync</code> is false, an update
is pushed from the process memory into the operating system before it
is considered done.
<p>
Asynchronous writes can often be used safely. For example, when
loading a large amount of data into the database you can handle lost
updates by restarting the bulk load after a crash. A hybrid scheme is
also possible where every Nth write is synchronous, and in the event
of a crash, the bulk load is restarted just after the last synchronous
write finished by the previous run. (The synchronous write can update
a marker that describes where to restart on a crash.)
<p>
<code>WriteBatch</code> provides an alternative to asynchronous writes.
Multiple updates may be placed in the same <code>WriteBatch</code> and
applied together using a synchronous write (i.e.,
<code>write_options.sync</code> is set to true). The extra cost of
the synchronous write will be amortized across all of the writes in
the batch.
<p>
<h1>Concurrency</h1>
<p>
A database may only be opened by one process at a time.
The <code>leveldb</code> implementation acquires a lock from the
operating system to prevent misuse. Within a single process, the
same <code>leveldb::DB</code> object may be safely shared by multiple
concurrent threads. I.e., different threads may write into or fetch
iterators or call <code>Get</code> on the same database without any
external synchronization (the leveldb implementation will
automatically do the required synchronization). However other objects
(like Iterator and WriteBatch) may require external synchronization.
If two threads share such an object, they must protect access to it
using their own locking protocol. More details are available in
the public header files.
<p>
<h1>Iteration</h1>
<p>
The following example demonstrates how to print all key,value pairs
in a database.
<p>
<pre>
leveldb::Iterator* it = db-&gt;NewIterator(leveldb::ReadOptions());
for (it-&gt;SeekToFirst(); it-&gt;Valid(); it-&gt;Next()) {
cout &lt;&lt; it-&gt;key().ToString() &lt;&lt; ": " &lt;&lt; it-&gt;value().ToString() &lt;&lt; endl;
}
assert(it-&gt;status().ok()); // Check for any errors found during the scan
delete it;
</pre>
The following variation shows how to process just the keys in the
range <code>[start,limit)</code>:
<p>
<pre>
for (it-&gt;Seek(start);
it-&gt;Valid() &amp;&amp; it-&gt;key().ToString() &lt; limit;
it-&gt;Next()) {
...
}
</pre>
You can also process entries in reverse order. (Caveat: reverse
iteration may be somewhat slower than forward iteration.)
<p>
<pre>
for (it-&gt;SeekToLast(); it-&gt;Valid(); it-&gt;Prev()) {
...
}
</pre>
<h1>Snapshots</h1>
<p>
Snapshots provide consistent read-only views over the entire state of
the key-value store. <code>ReadOptions::snapshot</code> may be non-NULL to indicate
that a read should operate on a particular version of the DB state.
If <code>ReadOptions::snapshot</code> is NULL, the read will operate on an
implicit snapshot of the current state.
<p>
Snapshots are created by the DB::GetSnapshot() method:
<p>
<pre>
leveldb::ReadOptions options;
options.snapshot = db-&gt;GetSnapshot();
... apply some updates to db ...
leveldb::Iterator* iter = db-&gt;NewIterator(options);
... read using iter to view the state when the snapshot was created ...
delete iter;
db-&gt;ReleaseSnapshot(options.snapshot);
</pre>
Note that when a snapshot is no longer needed, it should be released
using the DB::ReleaseSnapshot interface. This allows the
implementation to get rid of state that was being maintained just to
support reading as of that snapshot.
<h1>Slice</h1>
<p>
The return value of the <code>it->key()</code> and <code>it->value()</code> calls above
are instances of the <code>leveldb::Slice</code> type. <code>Slice</code> is a simple
structure that contains a length and a pointer to an external byte
array. Returning a <code>Slice</code> is a cheaper alternative to returning a
<code>std::string</code> since we do not need to copy potentially large keys and
values. In addition, <code>leveldb</code> methods do not return null-terminated
C-style strings since <code>leveldb</code> keys and values are allowed to
contain '\0' bytes.
<p>
C++ strings and null-terminated C-style strings can be easily converted
to a Slice:
<p>
<pre>
leveldb::Slice s1 = "hello";
std::string str("world");
leveldb::Slice s2 = str;
</pre>
A Slice can be easily converted back to a C++ string:
<pre>
std::string str = s1.ToString();
assert(str == std::string("hello"));
</pre>
Be careful when using Slices since it is up to the caller to ensure that
the external byte array into which the Slice points remains live while
the Slice is in use. For example, the following is buggy:
<p>
<pre>
leveldb::Slice slice;
if (...) {
std::string str = ...;
slice = str;
}
Use(slice);
</pre>
When the <code>if</code> statement goes out of scope, <code>str</code> will be destroyed and the
backing storage for <code>slice</code> will disappear.
<p>
<h1>Comparators</h1>
<p>
The preceding examples used the default ordering function for key,
which orders bytes lexicographically. You can however supply a custom
comparator when opening a database. For example, suppose each
database key consists of two numbers and we should sort by the first
number, breaking ties by the second number. First, define a proper
subclass of <code>leveldb::Comparator</code> that expresses these rules:
<p>
<pre>
class TwoPartComparator : public leveldb::Comparator {
public:
// Three-way comparison function:
// if a &lt; b: negative result
// if a &gt; b: positive result
// else: zero result
int Compare(const leveldb::Slice&amp; a, const leveldb::Slice&amp; b) const {
int a1, a2, b1, b2;
ParseKey(a, &amp;a1, &amp;a2);
ParseKey(b, &amp;b1, &amp;b2);
if (a1 &lt; b1) return -1;
if (a1 &gt; b1) return +1;
if (a2 &lt; b2) return -1;
if (a2 &gt; b2) return +1;
return 0;
}
// Ignore the following methods for now:
const char* Name() const { return "TwoPartComparator"; }
void FindShortestSeparator(std::string*, const leveldb::Slice&amp;) const { }
void FindShortSuccessor(std::string*) const { }
};
</pre>
Now create a database using this custom comparator:
<p>
<pre>
TwoPartComparator cmp;
leveldb::DB* db;
leveldb::Options options;
options.create_if_missing = true;
options.comparator = &amp;cmp;
leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &amp;db);
...
</pre>
<h2>Backwards compatibility</h2>
<p>
The result of the comparator's <code>Name</code> method is attached to the
database when it is created, and is checked on every subsequent
database open. If the name changes, the <code>leveldb::DB::Open</code> call will
fail. Therefore, change the name if and only if the new key format
and comparison function are incompatible with existing databases, and
it is ok to discard the contents of all existing databases.
<p>
You can however still gradually evolve your key format over time with
a little bit of pre-planning. For example, you could store a version
number at the end of each key (one byte should suffice for most uses).
When you wish to switch to a new key format (e.g., adding an optional
third part to the keys processed by <code>TwoPartComparator</code>),
(a) keep the same comparator name (b) increment the version number
for new keys (c) change the comparator function so it uses the
version numbers found in the keys to decide how to interpret them.
<p>
<h1>Performance</h1>
<p>
Performance can be tuned by changing the default values of the
types defined in <code>include/leveldb/options.h</code>.
<p>
<h2>Block size</h2>
<p>
<code>leveldb</code> groups adjacent keys together into the same block and such a
block is the unit of transfer to and from persistent storage. The
default block size is approximately 4096 uncompressed bytes.
Applications that mostly do bulk scans over the contents of the
database may wish to increase this size. Applications that do a lot
of point reads of small values may wish to switch to a smaller block
size if performance measurements indicate an improvement. There isn't
much benefit in using blocks smaller than one kilobyte, or larger than
a few megabytes. Also note that compression will be more effective
with larger block sizes.
<p>
<h2>Compression</h2>
<p>
Each block is individually compressed before being written to
persistent storage. Compression is on by default since the default
compression method is very fast, and is automatically disabled for
uncompressible data. In rare cases, applications may want to disable
compression entirely, but should only do so if benchmarks show a
performance improvement:
<p>
<pre>
leveldb::Options options;
options.compression = leveldb::kNoCompression;
... leveldb::DB::Open(options, name, ...) ....
</pre>
<h2>Cache</h2>
<p>
The contents of the database are stored in a set of files in the
filesystem and each file stores a sequence of compressed blocks. If
<code>options.cache</code> is non-NULL, it is used to cache frequently used
uncompressed block contents.
<p>
<pre>
#include "leveldb/cache.h"
leveldb::Options options;
options.cache = leveldb::NewLRUCache(100 * 1048576); // 100MB cache
leveldb::DB* db;
leveldb::DB::Open(options, name, &db);
... use the db ...
delete db
delete options.cache;
</pre>
Note that the cache holds uncompressed data, and therefore it should
be sized according to application level data sizes, without any
reduction from compression. (Caching of compressed blocks is left to
the operating system buffer cache, or any custom <code>Env</code>
implementation provided by the client.)
<p>
When performing a bulk read, the application may wish to disable
caching so that the data processed by the bulk read does not end up
displacing most of the cached contents. A per-iterator option can be
used to achieve this:
<p>
<pre>
leveldb::ReadOptions options;
options.fill_cache = false;
leveldb::Iterator* it = db-&gt;NewIterator(options);
for (it-&gt;SeekToFirst(); it-&gt;Valid(); it-&gt;Next()) {
...
}
</pre>
<h2>Key Layout</h2>
<p>
Note that the unit of disk transfer and caching is a block. Adjacent
keys (according to the database sort order) will usually be placed in
the same block. Therefore the application can improve its performance
by placing keys that are accessed together near each other and placing
infrequently used keys in a separate region of the key space.
<p>
For example, suppose we are implementing a simple file system on top
of <code>leveldb</code>. The types of entries we might wish to store are:
<p>
<pre>
filename -&gt; permission-bits, length, list of file_block_ids
file_block_id -&gt; data
</pre>
We might want to prefix <code>filename</code> keys with one letter (say '/') and the
<code>file_block_id</code> keys with a different letter (say '0') so that scans
over just the metadata do not force us to fetch and cache bulky file
contents.
<p>
<h2>Filters</h2>
<p>
Because of the way <code>leveldb</code> data is organized on disk,
a single <code>Get()</code> call may involve multiple reads from disk.
The optional <code>FilterPolicy</code> mechanism can be used to reduce
the number of disk reads substantially.
<pre>
leveldb::Options options;
options.filter_policy = NewBloomFilterPolicy(10);
leveldb::DB* db;
leveldb::DB::Open(options, "/tmp/testdb", &amp;db);
... use the database ...
delete db;
delete options.filter_policy;
</pre>
The preceding code associates a
<a href="http://en.wikipedia.org/wiki/Bloom_filter">Bloom filter</a>
based filtering policy with the database. Bloom filter based
filtering relies on keeping some number of bits of data in memory per
key (in this case 10 bits per key since that is the argument we passed
to NewBloomFilterPolicy). This filter will reduce the number of unnecessary
disk reads needed for <code>Get()</code> calls by a factor of
approximately a 100. Increasing the bits per key will lead to a
larger reduction at the cost of more memory usage. We recommend that
applications whose working set does not fit in memory and that do a
lot of random reads set a filter policy.
<p>
If you are using a custom comparator, you should ensure that the filter
policy you are using is compatible with your comparator. For example,
consider a comparator that ignores trailing spaces when comparing keys.
<code>NewBloomFilterPolicy</code> must not be used with such a comparator.
Instead, the application should provide a custom filter policy that
also ignores trailing spaces. For example:
<pre>
class CustomFilterPolicy : public leveldb::FilterPolicy {
private:
FilterPolicy* builtin_policy_;
public:
CustomFilterPolicy() : builtin_policy_(NewBloomFilterPolicy(10)) { }
~CustomFilterPolicy() { delete builtin_policy_; }
const char* Name() const { return "IgnoreTrailingSpacesFilter"; }
void CreateFilter(const Slice* keys, int n, std::string* dst) const {
// Use builtin bloom filter code after removing trailing spaces
std::vector&lt;Slice&gt; trimmed(n);
for (int i = 0; i &lt; n; i++) {
trimmed[i] = RemoveTrailingSpaces(keys[i]);
}
return builtin_policy_-&gt;CreateFilter(&amp;trimmed[i], n, dst);
}
bool KeyMayMatch(const Slice& key, const Slice& filter) const {
// Use builtin bloom filter code after removing trailing spaces
return builtin_policy_-&gt;KeyMayMatch(RemoveTrailingSpaces(key), filter);
}
};
</pre>
<p>
Advanced applications may provide a filter policy that does not use
a bloom filter but uses some other mechanism for summarizing a set
of keys. See <code>leveldb/filter_policy.h</code> for detail.
<p>
<h1>Checksums</h1>
<p>
<code>leveldb</code> associates checksums with all data it stores in the file system.
There are two separate controls provided over how aggressively these
checksums are verified:
<p>
<ul>
<li> <code>ReadOptions::verify_checksums</code> may be set to true to force
checksum verification of all data that is read from the file system on
behalf of a particular read. By default, no such verification is
done.
<p>
<li> <code>Options::paranoid_checks</code> may be set to true before opening a
database to make the database implementation raise an error as soon as
it detects an internal corruption. Depending on which portion of the
database has been corrupted, the error may be raised when the database
is opened, or later by another database operation. By default,
paranoid checking is off so that the database can be used even if
parts of its persistent storage have been corrupted.
<p>
If a database is corrupted (perhaps it cannot be opened when
paranoid checking is turned on), the <code>leveldb::RepairDB</code> function
may be used to recover as much of the data as possible
<p>
</ul>
<h1>Approximate Sizes</h1>
<p>
The <code>GetApproximateSizes</code> method can used to get the approximate
number of bytes of file system space used by one or more key ranges.
<p>
<pre>
leveldb::Range ranges[2];
ranges[0] = leveldb::Range("a", "c");
ranges[1] = leveldb::Range("x", "z");
uint64_t sizes[2];
leveldb::Status s = db-&gt;GetApproximateSizes(ranges, 2, sizes);
</pre>
The preceding call will set <code>sizes[0]</code> to the approximate number of
bytes of file system space used by the key range <code>[a..c)</code> and
<code>sizes[1]</code> to the approximate number of bytes used by the key range
<code>[x..z)</code>.
<p>
<h1>Environment</h1>
<p>
All file operations (and other operating system calls) issued by the
<code>leveldb</code> implementation are routed through a <code>leveldb::Env</code> object.
Sophisticated clients may wish to provide their own <code>Env</code>
implementation to get better control. For example, an application may
introduce artificial delays in the file IO paths to limit the impact
of <code>leveldb</code> on other activities in the system.
<p>
<pre>
class SlowEnv : public leveldb::Env {
.. implementation of the Env interface ...
};
SlowEnv env;
leveldb::Options options;
options.env = &amp;env;
Status s = leveldb::DB::Open(options, ...);
</pre>
<h1>Porting</h1>
<p>
<code>leveldb</code> may be ported to a new platform by providing platform
specific implementations of the types/methods/functions exported by
<code>leveldb/port/port.h</code>. See <code>leveldb/port/port_example.h</code> for more
details.
<p>
In addition, the new platform may need a new default <code>leveldb::Env</code>
implementation. See <code>leveldb/util/env_posix.h</code> for an example.
<h1>Other Information</h1>
<p>
Details about the <code>leveldb</code> implementation may be found in
the following documents:
<ul>
<li> <a href="impl.html">Implementation notes</a>
<li> <a href="table_format.txt">Format of an immutable Table file</a>
<li> <a href="log_format.txt">Format of a log file</a>
</ul>
</body>
</html>

View File

@@ -0,0 +1,75 @@
The log file contents are a sequence of 32KB blocks. The only
exception is that the tail of the file may contain a partial block.
Each block consists of a sequence of records:
block := record* trailer?
record :=
checksum: uint32 // crc32c of type and data[] ; little-endian
length: uint16 // little-endian
type: uint8 // One of FULL, FIRST, MIDDLE, LAST
data: uint8[length]
A record never starts within the last six bytes of a block (since it
won't fit). Any leftover bytes here form the trailer, which must
consist entirely of zero bytes and must be skipped by readers.
Aside: if exactly seven bytes are left in the current block, and a new
non-zero length record is added, the writer must emit a FIRST record
(which contains zero bytes of user data) to fill up the trailing seven
bytes of the block and then emit all of the user data in subsequent
blocks.
More types may be added in the future. Some Readers may skip record
types they do not understand, others may report that some data was
skipped.
FULL == 1
FIRST == 2
MIDDLE == 3
LAST == 4
The FULL record contains the contents of an entire user record.
FIRST, MIDDLE, LAST are types used for user records that have been
split into multiple fragments (typically because of block boundaries).
FIRST is the type of the first fragment of a user record, LAST is the
type of the last fragment of a user record, and MID is the type of all
interior fragments of a user record.
Example: consider a sequence of user records:
A: length 1000
B: length 97270
C: length 8000
A will be stored as a FULL record in the first block.
B will be split into three fragments: first fragment occupies the rest
of the first block, second fragment occupies the entirety of the
second block, and the third fragment occupies a prefix of the third
block. This will leave six bytes free in the third block, which will
be left empty as the trailer.
C will be stored as a FULL record in the fourth block.
===================
Some benefits over the recordio format:
(1) We do not need any heuristics for resyncing - just go to next
block boundary and scan. If there is a corruption, skip to the next
block. As a side-benefit, we do not get confused when part of the
contents of one log file are embedded as a record inside another log
file.
(2) Splitting at approximate boundaries (e.g., for mapreduce) is
simple: find the next block boundary and skip records until we
hit a FULL or FIRST record.
(3) We do not need extra buffering for large records.
Some downsides compared to recordio format:
(1) No packing of tiny records. This could be fixed by adding a new
record type, so it is a shortcoming of the current implementation,
not necessarily the format.
(2) No compression. Again, this could be fixed by adding new record types.

View File

@@ -0,0 +1,104 @@
File format
===========
<beginning_of_file>
[data block 1]
[data block 2]
...
[data block N]
[meta block 1]
...
[meta block K]
[metaindex block]
[index block]
[Footer] (fixed size; starts at file_size - sizeof(Footer))
<end_of_file>
The file contains internal pointers. Each such pointer is called
a BlockHandle and contains the following information:
offset: varint64
size: varint64
See https://developers.google.com/protocol-buffers/docs/encoding#varints
for an explanation of varint64 format.
(1) The sequence of key/value pairs in the file are stored in sorted
order and partitioned into a sequence of data blocks. These blocks
come one after another at the beginning of the file. Each data block
is formatted according to the code in block_builder.cc, and then
optionally compressed.
(2) After the data blocks we store a bunch of meta blocks. The
supported meta block types are described below. More meta block types
may be added in the future. Each meta block is again formatted using
block_builder.cc and then optionally compressed.
(3) A "metaindex" block. It contains one entry for every other meta
block where the key is the name of the meta block and the value is a
BlockHandle pointing to that meta block.
(4) An "index" block. This block contains one entry per data block,
where the key is a string >= last key in that data block and before
the first key in the successive data block. The value is the
BlockHandle for the data block.
(6) At the very end of the file is a fixed length footer that contains
the BlockHandle of the metaindex and index blocks as well as a magic number.
metaindex_handle: char[p]; // Block handle for metaindex
index_handle: char[q]; // Block handle for index
padding: char[40-p-q]; // zeroed bytes to make fixed length
// (40==2*BlockHandle::kMaxEncodedLength)
magic: fixed64; // == 0xdb4775248b80fb57 (little-endian)
"filter" Meta Block
-------------------
If a "FilterPolicy" was specified when the database was opened, a
filter block is stored in each table. The "metaindex" block contains
an entry that maps from "filter.<N>" to the BlockHandle for the filter
block where "<N>" is the string returned by the filter policy's
"Name()" method.
The filter block stores a sequence of filters, where filter i contains
the output of FilterPolicy::CreateFilter() on all keys that are stored
in a block whose file offset falls within the range
[ i*base ... (i+1)*base-1 ]
Currently, "base" is 2KB. So for example, if blocks X and Y start in
the range [ 0KB .. 2KB-1 ], all of the keys in X and Y will be
converted to a filter by calling FilterPolicy::CreateFilter(), and the
resulting filter will be stored as the first filter in the filter
block.
The filter block is formatted as follows:
[filter 0]
[filter 1]
[filter 2]
...
[filter N-1]
[offset of filter 0] : 4 bytes
[offset of filter 1] : 4 bytes
[offset of filter 2] : 4 bytes
...
[offset of filter N-1] : 4 bytes
[offset of beginning of offset array] : 4 bytes
lg(base) : 1 byte
The offset array at the end of the filter block allows efficient
mapping from a data block offset to the corresponding filter.
"stats" Meta Block
------------------
This meta block contains a bunch of stats. The key is the name
of the statistic. The value contains the statistic.
TODO(postrelease): record following stats.
data size
index size
key size (uncompressed)
value size (uncompressed)
number of entries
number of data blocks

View File

@@ -0,0 +1,384 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "helpers/memenv/memenv.h"
#include "leveldb/env.h"
#include "leveldb/status.h"
#include "port/port.h"
#include "util/mutexlock.h"
#include <map>
#include <string.h>
#include <string>
#include <vector>
namespace leveldb {
namespace {
class FileState {
public:
// FileStates are reference counted. The initial reference count is zero
// and the caller must call Ref() at least once.
FileState() : refs_(0), size_(0) {}
// Increase the reference count.
void Ref() {
MutexLock lock(&refs_mutex_);
++refs_;
}
// Decrease the reference count. Delete if this is the last reference.
void Unref() {
bool do_delete = false;
{
MutexLock lock(&refs_mutex_);
--refs_;
assert(refs_ >= 0);
if (refs_ <= 0) {
do_delete = true;
}
}
if (do_delete) {
delete this;
}
}
uint64_t Size() const { return size_; }
Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const {
if (offset > size_) {
return Status::IOError("Offset greater than file size.");
}
const uint64_t available = size_ - offset;
if (n > available) {
n = available;
}
if (n == 0) {
*result = Slice();
return Status::OK();
}
size_t block = offset / kBlockSize;
size_t block_offset = offset % kBlockSize;
if (n <= kBlockSize - block_offset) {
// The requested bytes are all in the first block.
*result = Slice(blocks_[block] + block_offset, n);
return Status::OK();
}
size_t bytes_to_copy = n;
char* dst = scratch;
while (bytes_to_copy > 0) {
size_t avail = kBlockSize - block_offset;
if (avail > bytes_to_copy) {
avail = bytes_to_copy;
}
memcpy(dst, blocks_[block] + block_offset, avail);
bytes_to_copy -= avail;
dst += avail;
block++;
block_offset = 0;
}
*result = Slice(scratch, n);
return Status::OK();
}
Status Append(const Slice& data) {
const char* src = data.data();
size_t src_len = data.size();
while (src_len > 0) {
size_t avail;
size_t offset = size_ % kBlockSize;
if (offset != 0) {
// There is some room in the last block.
avail = kBlockSize - offset;
} else {
// No room in the last block; push new one.
blocks_.push_back(new char[kBlockSize]);
avail = kBlockSize;
}
if (avail > src_len) {
avail = src_len;
}
memcpy(blocks_.back() + offset, src, avail);
src_len -= avail;
src += avail;
size_ += avail;
}
return Status::OK();
}
private:
// Private since only Unref() should be used to delete it.
~FileState() {
for (std::vector<char*>::iterator i = blocks_.begin(); i != blocks_.end();
++i) {
delete [] *i;
}
}
// No copying allowed.
FileState(const FileState&);
void operator=(const FileState&);
port::Mutex refs_mutex_;
int refs_; // Protected by refs_mutex_;
// The following fields are not protected by any mutex. They are only mutable
// while the file is being written, and concurrent access is not allowed
// to writable files.
std::vector<char*> blocks_;
uint64_t size_;
enum { kBlockSize = 8 * 1024 };
};
class SequentialFileImpl : public SequentialFile {
public:
explicit SequentialFileImpl(FileState* file) : file_(file), pos_(0) {
file_->Ref();
}
~SequentialFileImpl() {
file_->Unref();
}
virtual Status Read(size_t n, Slice* result, char* scratch) {
Status s = file_->Read(pos_, n, result, scratch);
if (s.ok()) {
pos_ += result->size();
}
return s;
}
virtual Status Skip(uint64_t n) {
if (pos_ > file_->Size()) {
return Status::IOError("pos_ > file_->Size()");
}
const size_t available = file_->Size() - pos_;
if (n > available) {
n = available;
}
pos_ += n;
return Status::OK();
}
private:
FileState* file_;
size_t pos_;
};
class RandomAccessFileImpl : public RandomAccessFile {
public:
explicit RandomAccessFileImpl(FileState* file) : file_(file) {
file_->Ref();
}
~RandomAccessFileImpl() {
file_->Unref();
}
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const {
return file_->Read(offset, n, result, scratch);
}
private:
FileState* file_;
};
class WritableFileImpl : public WritableFile {
public:
WritableFileImpl(FileState* file) : file_(file) {
file_->Ref();
}
~WritableFileImpl() {
file_->Unref();
}
virtual Status Append(const Slice& data) {
return file_->Append(data);
}
virtual Status Close() { return Status::OK(); }
virtual Status Flush() { return Status::OK(); }
virtual Status Sync() { return Status::OK(); }
private:
FileState* file_;
};
class NoOpLogger : public Logger {
public:
virtual void Logv(const char* format, va_list ap) { }
};
class InMemoryEnv : public EnvWrapper {
public:
explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { }
virtual ~InMemoryEnv() {
for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
i->second->Unref();
}
}
// Partial implementation of the Env interface.
virtual Status NewSequentialFile(const std::string& fname,
SequentialFile** result) {
MutexLock lock(&mutex_);
if (file_map_.find(fname) == file_map_.end()) {
*result = NULL;
return Status::IOError(fname, "File not found");
}
*result = new SequentialFileImpl(file_map_[fname]);
return Status::OK();
}
virtual Status NewRandomAccessFile(const std::string& fname,
RandomAccessFile** result) {
MutexLock lock(&mutex_);
if (file_map_.find(fname) == file_map_.end()) {
*result = NULL;
return Status::IOError(fname, "File not found");
}
*result = new RandomAccessFileImpl(file_map_[fname]);
return Status::OK();
}
virtual Status NewWritableFile(const std::string& fname,
WritableFile** result) {
MutexLock lock(&mutex_);
if (file_map_.find(fname) != file_map_.end()) {
DeleteFileInternal(fname);
}
FileState* file = new FileState();
file->Ref();
file_map_[fname] = file;
*result = new WritableFileImpl(file);
return Status::OK();
}
virtual bool FileExists(const std::string& fname) {
MutexLock lock(&mutex_);
return file_map_.find(fname) != file_map_.end();
}
virtual Status GetChildren(const std::string& dir,
std::vector<std::string>* result) {
MutexLock lock(&mutex_);
result->clear();
for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
const std::string& filename = i->first;
if (filename.size() >= dir.size() + 1 && filename[dir.size()] == '/' &&
Slice(filename).starts_with(Slice(dir))) {
result->push_back(filename.substr(dir.size() + 1));
}
}
return Status::OK();
}
void DeleteFileInternal(const std::string& fname) {
if (file_map_.find(fname) == file_map_.end()) {
return;
}
file_map_[fname]->Unref();
file_map_.erase(fname);
}
virtual Status DeleteFile(const std::string& fname) {
MutexLock lock(&mutex_);
if (file_map_.find(fname) == file_map_.end()) {
return Status::IOError(fname, "File not found");
}
DeleteFileInternal(fname);
return Status::OK();
}
virtual Status CreateDir(const std::string& dirname) {
return Status::OK();
}
virtual Status DeleteDir(const std::string& dirname) {
return Status::OK();
}
virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) {
MutexLock lock(&mutex_);
if (file_map_.find(fname) == file_map_.end()) {
return Status::IOError(fname, "File not found");
}
*file_size = file_map_[fname]->Size();
return Status::OK();
}
virtual Status RenameFile(const std::string& src,
const std::string& target) {
MutexLock lock(&mutex_);
if (file_map_.find(src) == file_map_.end()) {
return Status::IOError(src, "File not found");
}
DeleteFileInternal(target);
file_map_[target] = file_map_[src];
file_map_.erase(src);
return Status::OK();
}
virtual Status LockFile(const std::string& fname, FileLock** lock) {
*lock = new FileLock;
return Status::OK();
}
virtual Status UnlockFile(FileLock* lock) {
delete lock;
return Status::OK();
}
virtual Status GetTestDirectory(std::string* path) {
*path = "/test";
return Status::OK();
}
virtual Status NewLogger(const std::string& fname, Logger** result) {
*result = new NoOpLogger;
return Status::OK();
}
private:
// Map from filenames to FileState objects, representing a simple file system.
typedef std::map<std::string, FileState*> FileSystem;
port::Mutex mutex_;
FileSystem file_map_; // Protected by mutex_.
};
} // namespace
Env* NewMemEnv(Env* base_env) {
return new InMemoryEnv(base_env);
}
} // namespace leveldb

View File

@@ -0,0 +1,20 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_HELPERS_MEMENV_MEMENV_H_
#define STORAGE_LEVELDB_HELPERS_MEMENV_MEMENV_H_
namespace leveldb {
class Env;
// Returns a new environment that stores its data in memory and delegates
// all non-file-storage tasks to base_env. The caller must delete the result
// when it is no longer needed.
// *base_env must remain live while the result is in use.
Env* NewMemEnv(Env* base_env);
} // namespace leveldb
#endif // STORAGE_LEVELDB_HELPERS_MEMENV_MEMENV_H_

View File

@@ -0,0 +1,232 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "helpers/memenv/memenv.h"
#include "db/db_impl.h"
#include "leveldb/db.h"
#include "leveldb/env.h"
#include "util/testharness.h"
#include <string>
#include <vector>
namespace leveldb {
class MemEnvTest {
public:
Env* env_;
MemEnvTest()
: env_(NewMemEnv(Env::Default())) {
}
~MemEnvTest() {
delete env_;
}
};
TEST(MemEnvTest, Basics) {
uint64_t file_size;
WritableFile* writable_file;
std::vector<std::string> children;
ASSERT_OK(env_->CreateDir("/dir"));
// Check that the directory is empty.
ASSERT_TRUE(!env_->FileExists("/dir/non_existent"));
ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok());
ASSERT_OK(env_->GetChildren("/dir", &children));
ASSERT_EQ(0, children.size());
// Create a file.
ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
delete writable_file;
// Check that the file exists.
ASSERT_TRUE(env_->FileExists("/dir/f"));
ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
ASSERT_EQ(0, file_size);
ASSERT_OK(env_->GetChildren("/dir", &children));
ASSERT_EQ(1, children.size());
ASSERT_EQ("f", children[0]);
// Write to the file.
ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
ASSERT_OK(writable_file->Append("abc"));
delete writable_file;
// Check for expected size.
ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
ASSERT_EQ(3, file_size);
// Check that renaming works.
ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok());
ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g"));
ASSERT_TRUE(!env_->FileExists("/dir/f"));
ASSERT_TRUE(env_->FileExists("/dir/g"));
ASSERT_OK(env_->GetFileSize("/dir/g", &file_size));
ASSERT_EQ(3, file_size);
// Check that opening non-existent file fails.
SequentialFile* seq_file;
RandomAccessFile* rand_file;
ASSERT_TRUE(!env_->NewSequentialFile("/dir/non_existent", &seq_file).ok());
ASSERT_TRUE(!seq_file);
ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file).ok());
ASSERT_TRUE(!rand_file);
// Check that deleting works.
ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok());
ASSERT_OK(env_->DeleteFile("/dir/g"));
ASSERT_TRUE(!env_->FileExists("/dir/g"));
ASSERT_OK(env_->GetChildren("/dir", &children));
ASSERT_EQ(0, children.size());
ASSERT_OK(env_->DeleteDir("/dir"));
}
TEST(MemEnvTest, ReadWrite) {
WritableFile* writable_file;
SequentialFile* seq_file;
RandomAccessFile* rand_file;
Slice result;
char scratch[100];
ASSERT_OK(env_->CreateDir("/dir"));
ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
ASSERT_OK(writable_file->Append("hello "));
ASSERT_OK(writable_file->Append("world"));
delete writable_file;
// Read sequentially.
ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file));
ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello".
ASSERT_EQ(0, result.compare("hello"));
ASSERT_OK(seq_file->Skip(1));
ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Read "world".
ASSERT_EQ(0, result.compare("world"));
ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Try reading past EOF.
ASSERT_EQ(0, result.size());
ASSERT_OK(seq_file->Skip(100)); // Try to skip past end of file.
ASSERT_OK(seq_file->Read(1000, &result, scratch));
ASSERT_EQ(0, result.size());
delete seq_file;
// Random reads.
ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file));
ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world".
ASSERT_EQ(0, result.compare("world"));
ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello".
ASSERT_EQ(0, result.compare("hello"));
ASSERT_OK(rand_file->Read(10, 100, &result, scratch)); // Read "d".
ASSERT_EQ(0, result.compare("d"));
// Too high offset.
ASSERT_TRUE(!rand_file->Read(1000, 5, &result, scratch).ok());
delete rand_file;
}
TEST(MemEnvTest, Locks) {
FileLock* lock;
// These are no-ops, but we test they return success.
ASSERT_OK(env_->LockFile("some file", &lock));
ASSERT_OK(env_->UnlockFile(lock));
}
TEST(MemEnvTest, Misc) {
std::string test_dir;
ASSERT_OK(env_->GetTestDirectory(&test_dir));
ASSERT_TRUE(!test_dir.empty());
WritableFile* writable_file;
ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file));
// These are no-ops, but we test they return success.
ASSERT_OK(writable_file->Sync());
ASSERT_OK(writable_file->Flush());
ASSERT_OK(writable_file->Close());
delete writable_file;
}
TEST(MemEnvTest, LargeWrite) {
const size_t kWriteSize = 300 * 1024;
char* scratch = new char[kWriteSize * 2];
std::string write_data;
for (size_t i = 0; i < kWriteSize; ++i) {
write_data.append(1, static_cast<char>(i));
}
WritableFile* writable_file;
ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file));
ASSERT_OK(writable_file->Append("foo"));
ASSERT_OK(writable_file->Append(write_data));
delete writable_file;
SequentialFile* seq_file;
Slice result;
ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file));
ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo".
ASSERT_EQ(0, result.compare("foo"));
size_t read = 0;
std::string read_data;
while (read < kWriteSize) {
ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch));
read_data.append(result.data(), result.size());
read += result.size();
}
ASSERT_TRUE(write_data == read_data);
delete seq_file;
delete [] scratch;
}
TEST(MemEnvTest, DBTest) {
Options options;
options.create_if_missing = true;
options.env = env_;
DB* db;
const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
ASSERT_OK(DB::Open(options, "/dir/db", &db));
for (size_t i = 0; i < 3; ++i) {
ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
}
for (size_t i = 0; i < 3; ++i) {
std::string res;
ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
ASSERT_TRUE(res == vals[i]);
}
Iterator* iterator = db->NewIterator(ReadOptions());
iterator->SeekToFirst();
for (size_t i = 0; i < 3; ++i) {
ASSERT_TRUE(iterator->Valid());
ASSERT_TRUE(keys[i] == iterator->key());
ASSERT_TRUE(vals[i] == iterator->value());
iterator->Next();
}
ASSERT_TRUE(!iterator->Valid());
delete iterator;
DBImpl* dbi = reinterpret_cast<DBImpl*>(db);
ASSERT_OK(dbi->TEST_CompactMemTable());
for (size_t i = 0; i < 3; ++i) {
std::string res;
ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
ASSERT_TRUE(res == vals[i]);
}
delete db;
}
} // namespace leveldb
int main(int argc, char** argv) {
return leveldb::test::RunAllTests();
}

View File

@@ -0,0 +1,291 @@
/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
Use of this source code is governed by a BSD-style license that can be
found in the LICENSE file. See the AUTHORS file for names of contributors.
C bindings for leveldb. May be useful as a stable ABI that can be
used by programs that keep leveldb in a shared library, or for
a JNI api.
Does not support:
. getters for the option types
. custom comparators that implement key shortening
. capturing post-write-snapshot
. custom iter, db, env, cache implementations using just the C bindings
Some conventions:
(1) We expose just opaque struct pointers and functions to clients.
This allows us to change internal representations without having to
recompile clients.
(2) For simplicity, there is no equivalent to the Slice type. Instead,
the caller has to pass the pointer and length as separate
arguments.
(3) Errors are represented by a null-terminated c string. NULL
means no error. All operations that can raise an error are passed
a "char** errptr" as the last argument. One of the following must
be true on entry:
*errptr == NULL
*errptr points to a malloc()ed null-terminated error message
(On Windows, *errptr must have been malloc()-ed by this library.)
On success, a leveldb routine leaves *errptr unchanged.
On failure, leveldb frees the old value of *errptr and
set *errptr to a malloc()ed error message.
(4) Bools have the type unsigned char (0 == false; rest == true)
(5) All of the pointer arguments must be non-NULL.
*/
#ifndef STORAGE_LEVELDB_INCLUDE_C_H_
#define STORAGE_LEVELDB_INCLUDE_C_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdarg.h>
#include <stddef.h>
#include <stdint.h>
/* Exported types */
typedef struct leveldb_t leveldb_t;
typedef struct leveldb_cache_t leveldb_cache_t;
typedef struct leveldb_comparator_t leveldb_comparator_t;
typedef struct leveldb_env_t leveldb_env_t;
typedef struct leveldb_filelock_t leveldb_filelock_t;
typedef struct leveldb_filterpolicy_t leveldb_filterpolicy_t;
typedef struct leveldb_iterator_t leveldb_iterator_t;
typedef struct leveldb_logger_t leveldb_logger_t;
typedef struct leveldb_options_t leveldb_options_t;
typedef struct leveldb_randomfile_t leveldb_randomfile_t;
typedef struct leveldb_readoptions_t leveldb_readoptions_t;
typedef struct leveldb_seqfile_t leveldb_seqfile_t;
typedef struct leveldb_snapshot_t leveldb_snapshot_t;
typedef struct leveldb_writablefile_t leveldb_writablefile_t;
typedef struct leveldb_writebatch_t leveldb_writebatch_t;
typedef struct leveldb_writeoptions_t leveldb_writeoptions_t;
/* DB operations */
extern leveldb_t* leveldb_open(
const leveldb_options_t* options,
const char* name,
char** errptr);
extern void leveldb_close(leveldb_t* db);
extern void leveldb_put(
leveldb_t* db,
const leveldb_writeoptions_t* options,
const char* key, size_t keylen,
const char* val, size_t vallen,
char** errptr);
extern void leveldb_delete(
leveldb_t* db,
const leveldb_writeoptions_t* options,
const char* key, size_t keylen,
char** errptr);
extern void leveldb_write(
leveldb_t* db,
const leveldb_writeoptions_t* options,
leveldb_writebatch_t* batch,
char** errptr);
/* Returns NULL if not found. A malloc()ed array otherwise.
Stores the length of the array in *vallen. */
extern char* leveldb_get(
leveldb_t* db,
const leveldb_readoptions_t* options,
const char* key, size_t keylen,
size_t* vallen,
char** errptr);
extern leveldb_iterator_t* leveldb_create_iterator(
leveldb_t* db,
const leveldb_readoptions_t* options);
extern const leveldb_snapshot_t* leveldb_create_snapshot(
leveldb_t* db);
extern void leveldb_release_snapshot(
leveldb_t* db,
const leveldb_snapshot_t* snapshot);
/* Returns NULL if property name is unknown.
Else returns a pointer to a malloc()-ed null-terminated value. */
extern char* leveldb_property_value(
leveldb_t* db,
const char* propname);
extern void leveldb_approximate_sizes(
leveldb_t* db,
int num_ranges,
const char* const* range_start_key, const size_t* range_start_key_len,
const char* const* range_limit_key, const size_t* range_limit_key_len,
uint64_t* sizes);
extern void leveldb_compact_range(
leveldb_t* db,
const char* start_key, size_t start_key_len,
const char* limit_key, size_t limit_key_len);
/* Management operations */
extern void leveldb_destroy_db(
const leveldb_options_t* options,
const char* name,
char** errptr);
extern void leveldb_repair_db(
const leveldb_options_t* options,
const char* name,
char** errptr);
/* Iterator */
extern void leveldb_iter_destroy(leveldb_iterator_t*);
extern unsigned char leveldb_iter_valid(const leveldb_iterator_t*);
extern void leveldb_iter_seek_to_first(leveldb_iterator_t*);
extern void leveldb_iter_seek_to_last(leveldb_iterator_t*);
extern void leveldb_iter_seek(leveldb_iterator_t*, const char* k, size_t klen);
extern void leveldb_iter_next(leveldb_iterator_t*);
extern void leveldb_iter_prev(leveldb_iterator_t*);
extern const char* leveldb_iter_key(const leveldb_iterator_t*, size_t* klen);
extern const char* leveldb_iter_value(const leveldb_iterator_t*, size_t* vlen);
extern void leveldb_iter_get_error(const leveldb_iterator_t*, char** errptr);
/* Write batch */
extern leveldb_writebatch_t* leveldb_writebatch_create();
extern void leveldb_writebatch_destroy(leveldb_writebatch_t*);
extern void leveldb_writebatch_clear(leveldb_writebatch_t*);
extern void leveldb_writebatch_put(
leveldb_writebatch_t*,
const char* key, size_t klen,
const char* val, size_t vlen);
extern void leveldb_writebatch_delete(
leveldb_writebatch_t*,
const char* key, size_t klen);
extern void leveldb_writebatch_iterate(
leveldb_writebatch_t*,
void* state,
void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
void (*deleted)(void*, const char* k, size_t klen));
/* Options */
extern leveldb_options_t* leveldb_options_create();
extern void leveldb_options_destroy(leveldb_options_t*);
extern void leveldb_options_set_comparator(
leveldb_options_t*,
leveldb_comparator_t*);
extern void leveldb_options_set_filter_policy(
leveldb_options_t*,
leveldb_filterpolicy_t*);
extern void leveldb_options_set_create_if_missing(
leveldb_options_t*, unsigned char);
extern void leveldb_options_set_error_if_exists(
leveldb_options_t*, unsigned char);
extern void leveldb_options_set_paranoid_checks(
leveldb_options_t*, unsigned char);
extern void leveldb_options_set_env(leveldb_options_t*, leveldb_env_t*);
extern void leveldb_options_set_info_log(leveldb_options_t*, leveldb_logger_t*);
extern void leveldb_options_set_write_buffer_size(leveldb_options_t*, size_t);
extern void leveldb_options_set_max_open_files(leveldb_options_t*, int);
extern void leveldb_options_set_cache(leveldb_options_t*, leveldb_cache_t*);
extern void leveldb_options_set_block_size(leveldb_options_t*, size_t);
extern void leveldb_options_set_block_restart_interval(leveldb_options_t*, int);
enum {
leveldb_no_compression = 0,
leveldb_snappy_compression = 1
};
extern void leveldb_options_set_compression(leveldb_options_t*, int);
/* Comparator */
extern leveldb_comparator_t* leveldb_comparator_create(
void* state,
void (*destructor)(void*),
int (*compare)(
void*,
const char* a, size_t alen,
const char* b, size_t blen),
const char* (*name)(void*));
extern void leveldb_comparator_destroy(leveldb_comparator_t*);
/* Filter policy */
extern leveldb_filterpolicy_t* leveldb_filterpolicy_create(
void* state,
void (*destructor)(void*),
char* (*create_filter)(
void*,
const char* const* key_array, const size_t* key_length_array,
int num_keys,
size_t* filter_length),
unsigned char (*key_may_match)(
void*,
const char* key, size_t length,
const char* filter, size_t filter_length),
const char* (*name)(void*));
extern void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t*);
extern leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(
int bits_per_key);
/* Read options */
extern leveldb_readoptions_t* leveldb_readoptions_create();
extern void leveldb_readoptions_destroy(leveldb_readoptions_t*);
extern void leveldb_readoptions_set_verify_checksums(
leveldb_readoptions_t*,
unsigned char);
extern void leveldb_readoptions_set_fill_cache(
leveldb_readoptions_t*, unsigned char);
extern void leveldb_readoptions_set_snapshot(
leveldb_readoptions_t*,
const leveldb_snapshot_t*);
/* Write options */
extern leveldb_writeoptions_t* leveldb_writeoptions_create();
extern void leveldb_writeoptions_destroy(leveldb_writeoptions_t*);
extern void leveldb_writeoptions_set_sync(
leveldb_writeoptions_t*, unsigned char);
/* Cache */
extern leveldb_cache_t* leveldb_cache_create_lru(size_t capacity);
extern void leveldb_cache_destroy(leveldb_cache_t* cache);
/* Env */
extern leveldb_env_t* leveldb_create_default_env();
extern void leveldb_env_destroy(leveldb_env_t*);
/* Utility */
/* Calls free(ptr).
REQUIRES: ptr was malloc()-ed and returned by one of the routines
in this file. Note that in certain cases (typically on Windows), you
may need to call this routine instead of free(ptr) to dispose of
malloc()-ed memory returned by this library. */
extern void leveldb_free(void* ptr);
/* Return the major version number for this release. */
extern int leveldb_major_version();
/* Return the minor version number for this release. */
extern int leveldb_minor_version();
#ifdef __cplusplus
} /* end extern "C" */
#endif
#endif /* STORAGE_LEVELDB_INCLUDE_C_H_ */

View File

@@ -0,0 +1,99 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// A Cache is an interface that maps keys to values. It has internal
// synchronization and may be safely accessed concurrently from
// multiple threads. It may automatically evict entries to make room
// for new entries. Values have a specified charge against the cache
// capacity. For example, a cache where the values are variable
// length strings, may use the length of the string as the charge for
// the string.
//
// A builtin cache implementation with a least-recently-used eviction
// policy is provided. Clients may use their own implementations if
// they want something more sophisticated (like scan-resistance, a
// custom eviction policy, variable cache sizing, etc.)
#ifndef STORAGE_LEVELDB_INCLUDE_CACHE_H_
#define STORAGE_LEVELDB_INCLUDE_CACHE_H_
#include <stdint.h>
#include "leveldb/slice.h"
namespace leveldb {
class Cache;
// Create a new cache with a fixed size capacity. This implementation
// of Cache uses a least-recently-used eviction policy.
extern Cache* NewLRUCache(size_t capacity);
class Cache {
public:
Cache() { }
// Destroys all existing entries by calling the "deleter"
// function that was passed to the constructor.
virtual ~Cache();
// Opaque handle to an entry stored in the cache.
struct Handle { };
// Insert a mapping from key->value into the cache and assign it
// the specified charge against the total cache capacity.
//
// Returns a handle that corresponds to the mapping. The caller
// must call this->Release(handle) when the returned mapping is no
// longer needed.
//
// When the inserted entry is no longer needed, the key and
// value will be passed to "deleter".
virtual Handle* Insert(const Slice& key, void* value, size_t charge,
void (*deleter)(const Slice& key, void* value)) = 0;
// If the cache has no mapping for "key", returns NULL.
//
// Else return a handle that corresponds to the mapping. The caller
// must call this->Release(handle) when the returned mapping is no
// longer needed.
virtual Handle* Lookup(const Slice& key) = 0;
// Release a mapping returned by a previous Lookup().
// REQUIRES: handle must not have been released yet.
// REQUIRES: handle must have been returned by a method on *this.
virtual void Release(Handle* handle) = 0;
// Return the value encapsulated in a handle returned by a
// successful Lookup().
// REQUIRES: handle must not have been released yet.
// REQUIRES: handle must have been returned by a method on *this.
virtual void* Value(Handle* handle) = 0;
// If the cache contains entry for key, erase it. Note that the
// underlying entry will be kept around until all existing handles
// to it have been released.
virtual void Erase(const Slice& key) = 0;
// Return a new numeric id. May be used by multiple clients who are
// sharing the same cache to partition the key space. Typically the
// client will allocate a new id at startup and prepend the id to
// its cache keys.
virtual uint64_t NewId() = 0;
private:
void LRU_Remove(Handle* e);
void LRU_Append(Handle* e);
void Unref(Handle* e);
struct Rep;
Rep* rep_;
// No copying allowed
Cache(const Cache&);
void operator=(const Cache&);
};
} // namespace leveldb
#endif // STORAGE_LEVELDB_UTIL_CACHE_H_

View File

@@ -0,0 +1,63 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
#define STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
#include <string>
namespace leveldb {
class Slice;
// A Comparator object provides a total order across slices that are
// used as keys in an sstable or a database. A Comparator implementation
// must be thread-safe since leveldb may invoke its methods concurrently
// from multiple threads.
class Comparator {
public:
virtual ~Comparator();
// Three-way comparison. Returns value:
// < 0 iff "a" < "b",
// == 0 iff "a" == "b",
// > 0 iff "a" > "b"
virtual int Compare(const Slice& a, const Slice& b) const = 0;
// The name of the comparator. Used to check for comparator
// mismatches (i.e., a DB created with one comparator is
// accessed using a different comparator.
//
// The client of this package should switch to a new name whenever
// the comparator implementation changes in a way that will cause
// the relative ordering of any two keys to change.
//
// Names starting with "leveldb." are reserved and should not be used
// by any clients of this package.
virtual const char* Name() const = 0;
// Advanced functions: these are used to reduce the space requirements
// for internal data structures like index blocks.
// If *start < limit, changes *start to a short string in [start,limit).
// Simple comparator implementations may return with *start unchanged,
// i.e., an implementation of this method that does nothing is correct.
virtual void FindShortestSeparator(
std::string* start,
const Slice& limit) const = 0;
// Changes *key to a short string >= *key.
// Simple comparator implementations may return with *key unchanged,
// i.e., an implementation of this method that does nothing is correct.
virtual void FindShortSuccessor(std::string* key) const = 0;
};
// Return a builtin comparator that uses lexicographic byte-wise
// ordering. The result remains the property of this module and
// must not be deleted.
extern const Comparator* BytewiseComparator();
} // namespace leveldb
#endif // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_

View File

@@ -0,0 +1,161 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef STORAGE_LEVELDB_INCLUDE_DB_H_
#define STORAGE_LEVELDB_INCLUDE_DB_H_
#include <stdint.h>
#include <stdio.h>
#include "leveldb/iterator.h"
#include "leveldb/options.h"
namespace leveldb {
// Update Makefile if you change these
static const int kMajorVersion = 1;
static const int kMinorVersion = 13;
struct Options;
struct ReadOptions;
struct WriteOptions;
class WriteBatch;
// Abstract handle to particular state of a DB.
// A Snapshot is an immutable object and can therefore be safely
// accessed from multiple threads without any external synchronization.
class Snapshot {
protected:
virtual ~Snapshot();
};
// A range of keys
struct Range {
Slice start; // Included in the range
Slice limit; // Not included in the range
Range() { }
Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
};
// A DB is a persistent ordered map from keys to values.
// A DB is safe for concurrent access from multiple threads without
// any external synchronization.
class DB {
public:
// Open the database with the specified "name".
// Stores a pointer to a heap-allocated database in *dbptr and returns
// OK on success.
// Stores NULL in *dbptr and returns a non-OK status on error.
// Caller should delete *dbptr when it is no longer needed.
static Status Open(const Options& options,
const std::string& name,
DB** dbptr);
DB() { }
virtual ~DB();
// Set the database entry for "key" to "value". Returns OK on success,
// and a non-OK status on error.
// Note: consider setting options.sync = true.
virtual Status Put(const WriteOptions& options,
const Slice& key,
const Slice& value) = 0;
// Remove the database entry (if any) for "key". Returns OK on
// success, and a non-OK status on error. It is not an error if "key"
// did not exist in the database.
// Note: consider setting options.sync = true.
virtual Status Delete(const WriteOptions& options, const Slice& key) = 0;
// Apply the specified updates to the database.
// Returns OK on success, non-OK on failure.
// Note: consider setting options.sync = true.
virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
// If the database contains an entry for "key" store the
// corresponding value in *value and return OK.
//
// If there is no entry for "key" leave *value unchanged and return
// a status for which Status::IsNotFound() returns true.
//
// May return some other Status on an error.
virtual Status Get(const ReadOptions& options,
const Slice& key, std::string* value) = 0;
// Return a heap-allocated iterator over the contents of the database.
// The result of NewIterator() is initially invalid (caller must
// call one of the Seek methods on the iterator before using it).
//
// Caller should delete the iterator when it is no longer needed.
// The returned iterator should be deleted before this db is deleted.
virtual Iterator* NewIterator(const ReadOptions& options) = 0;
// Return a handle to the current DB state. Iterators created with
// this handle will all observe a stable snapshot of the current DB
// state. The caller must call ReleaseSnapshot(result) when the
// snapshot is no longer needed.
virtual const Snapshot* GetSnapshot() = 0;
// Release a previously acquired snapshot. The caller must not
// use "snapshot" after this call.
virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
// DB implementations can export properties about their state
// via this method. If "property" is a valid property understood by this
// DB implementation, fills "*value" with its current value and returns
// true. Otherwise returns false.
//
//
// Valid property names include:
//
// "leveldb.num-files-at-level<N>" - return the number of files at level <N>,
// where <N> is an ASCII representation of a level number (e.g. "0").
// "leveldb.stats" - returns a multi-line string that describes statistics
// about the internal operation of the DB.
// "leveldb.sstables" - returns a multi-line string that describes all
// of the sstables that make up the db contents.
virtual bool GetProperty(const Slice& property, std::string* value) = 0;
// For each i in [0,n-1], store in "sizes[i]", the approximate
// file system space used by keys in "[range[i].start .. range[i].limit)".
//
// Note that the returned sizes measure file system space usage, so
// if the user data compresses by a factor of ten, the returned
// sizes will be one-tenth the size of the corresponding user data size.
//
// The results may not include the sizes of recently written data.
virtual void GetApproximateSizes(const Range* range, int n,
uint64_t* sizes) = 0;
// Compact the underlying storage for the key range [*begin,*end].
// In particular, deleted and overwritten versions are discarded,
// and the data is rearranged to reduce the cost of operations
// needed to access the data. This operation should typically only
// be invoked by users who understand the underlying implementation.
//
// begin==NULL is treated as a key before all keys in the database.
// end==NULL is treated as a key after all keys in the database.
// Therefore the following call will compact the entire database:
// db->CompactRange(NULL, NULL);
virtual void CompactRange(const Slice* begin, const Slice* end) = 0;
private:
// No copying allowed
DB(const DB&);
void operator=(const DB&);
};
// Destroy the contents of the specified database.
// Be very careful using this method.
Status DestroyDB(const std::string& name, const Options& options);
// If a DB cannot be opened, you may attempt to call this method to
// resurrect as much of the contents of the database as possible.
// Some data may be lost, so be careful when calling this function
// on a database that contains important information.
Status RepairDB(const std::string& dbname, const Options& options);
} // namespace leveldb
#endif // STORAGE_LEVELDB_INCLUDE_DB_H_

View File

@@ -0,0 +1,333 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// An Env is an interface used by the leveldb implementation to access
// operating system functionality like the filesystem etc. Callers
// may wish to provide a custom Env object when opening a database to
// get fine gain control; e.g., to rate limit file system operations.
//
// All Env implementations are safe for concurrent access from
// multiple threads without any external synchronization.
#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_
#define STORAGE_LEVELDB_INCLUDE_ENV_H_
#include <cstdarg>
#include <string>
#include <vector>
#include <stdint.h>
#include "leveldb/status.h"
namespace leveldb {
class FileLock;
class Logger;
class RandomAccessFile;
class SequentialFile;
class Slice;
class WritableFile;
class Env {
public:
Env() { }
virtual ~Env();
// Return a default environment suitable for the current operating
// system. Sophisticated users may wish to provide their own Env
// implementation instead of relying on this default environment.
//
// The result of Default() belongs to leveldb and must never be deleted.
static Env* Default();
// Create a brand new sequentially-readable file with the specified name.
// On success, stores a pointer to the new file in *result and returns OK.
// On failure stores NULL in *result and returns non-OK. If the file does
// not exist, returns a non-OK status.
//
// The returned file will only be accessed by one thread at a time.
virtual Status NewSequentialFile(const std::string& fname,
SequentialFile** result) = 0;
// Create a brand new random access read-only file with the
// specified name. On success, stores a pointer to the new file in
// *result and returns OK. On failure stores NULL in *result and
// returns non-OK. If the file does not exist, returns a non-OK
// status.
//
// The returned file may be concurrently accessed by multiple threads.
virtual Status NewRandomAccessFile(const std::string& fname,
RandomAccessFile** result) = 0;
// Create an object that writes to a new file with the specified
// name. Deletes any existing file with the same name and creates a
// new file. On success, stores a pointer to the new file in
// *result and returns OK. On failure stores NULL in *result and
// returns non-OK.
//
// The returned file will only be accessed by one thread at a time.
virtual Status NewWritableFile(const std::string& fname,
WritableFile** result) = 0;
// Returns true iff the named file exists.
virtual bool FileExists(const std::string& fname) = 0;
// Store in *result the names of the children of the specified directory.
// The names are relative to "dir".
// Original contents of *results are dropped.
virtual Status GetChildren(const std::string& dir,
std::vector<std::string>* result) = 0;
// Delete the named file.
virtual Status DeleteFile(const std::string& fname) = 0;
// Create the specified directory.
virtual Status CreateDir(const std::string& dirname) = 0;
// Delete the specified directory.
virtual Status DeleteDir(const std::string& dirname) = 0;
// Store the size of fname in *file_size.
virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;
// Rename file src to target.
virtual Status RenameFile(const std::string& src,
const std::string& target) = 0;
// Lock the specified file. Used to prevent concurrent access to
// the same db by multiple processes. On failure, stores NULL in
// *lock and returns non-OK.
//
// On success, stores a pointer to the object that represents the
// acquired lock in *lock and returns OK. The caller should call
// UnlockFile(*lock) to release the lock. If the process exits,
// the lock will be automatically released.
//
// If somebody else already holds the lock, finishes immediately
// with a failure. I.e., this call does not wait for existing locks
// to go away.
//
// May create the named file if it does not already exist.
virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
// Release the lock acquired by a previous successful call to LockFile.
// REQUIRES: lock was returned by a successful LockFile() call
// REQUIRES: lock has not already been unlocked.
virtual Status UnlockFile(FileLock* lock) = 0;
// Arrange to run "(*function)(arg)" once in a background thread.
//
// "function" may run in an unspecified thread. Multiple functions
// added to the same Env may run concurrently in different threads.
// I.e., the caller may not assume that background work items are
// serialized.
virtual void Schedule(
void (*function)(void* arg),
void* arg) = 0;
// Start a new thread, invoking "function(arg)" within the new thread.
// When "function(arg)" returns, the thread will be destroyed.
virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
// *path is set to a temporary directory that can be used for testing. It may
// or many not have just been created. The directory may or may not differ
// between runs of the same process, but subsequent calls will return the
// same directory.
virtual Status GetTestDirectory(std::string* path) = 0;
// Create and return a log file for storing informational messages.
virtual Status NewLogger(const std::string& fname, Logger** result) = 0;
// Returns the number of micro-seconds since some fixed point in time. Only
// useful for computing deltas of time.
virtual uint64_t NowMicros() = 0;
// Sleep/delay the thread for the perscribed number of micro-seconds.
virtual void SleepForMicroseconds(int micros) = 0;
private:
// No copying allowed
Env(const Env&);
void operator=(const Env&);
};
// A file abstraction for reading sequentially through a file
class SequentialFile {
public:
SequentialFile() { }
virtual ~SequentialFile();
// Read up to "n" bytes from the file. "scratch[0..n-1]" may be
// written by this routine. Sets "*result" to the data that was
// read (including if fewer than "n" bytes were successfully read).
// May set "*result" to point at data in "scratch[0..n-1]", so
// "scratch[0..n-1]" must be live when "*result" is used.
// If an error was encountered, returns a non-OK status.
//
// REQUIRES: External synchronization
virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
// Skip "n" bytes from the file. This is guaranteed to be no
// slower that reading the same data, but may be faster.
//
// If end of file is reached, skipping will stop at the end of the
// file, and Skip will return OK.
//
// REQUIRES: External synchronization
virtual Status Skip(uint64_t n) = 0;
private:
// No copying allowed
SequentialFile(const SequentialFile&);
void operator=(const SequentialFile&);
};
// A file abstraction for randomly reading the contents of a file.
class RandomAccessFile {
public:
RandomAccessFile() { }
virtual ~RandomAccessFile();
// Read up to "n" bytes from the file starting at "offset".
// "scratch[0..n-1]" may be written by this routine. Sets "*result"
// to the data that was read (including if fewer than "n" bytes were
// successfully read). May set "*result" to point at data in
// "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
// "*result" is used. If an error was encountered, returns a non-OK
// status.
//
// Safe for concurrent use by multiple threads.
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const = 0;
private:
// No copying allowed
RandomAccessFile(const RandomAccessFile&);
void operator=(const RandomAccessFile&);
};
// A file abstraction for sequential writing. The implementation
// must provide buffering since callers may append small fragments
// at a time to the file.
class WritableFile {
public:
WritableFile() { }
virtual ~WritableFile();
virtual Status Append(const Slice& data) = 0;
virtual Status Close() = 0;
virtual Status Flush() = 0;
virtual Status Sync() = 0;
private:
// No copying allowed
WritableFile(const WritableFile&);
void operator=(const WritableFile&);
};
// An interface for writing log messages.
class Logger {
public:
Logger() { }
virtual ~Logger();
// Write an entry to the log file with the specified format.
virtual void Logv(const char* format, va_list ap) = 0;
private:
// No copying allowed
Logger(const Logger&);
void operator=(const Logger&);
};
// Identifies a locked file.
class FileLock {
public:
FileLock() { }
virtual ~FileLock();
private:
// No copying allowed
FileLock(const FileLock&);
void operator=(const FileLock&);
};
// Log the specified data to *info_log if info_log is non-NULL.
extern void Log(Logger* info_log, const char* format, ...)
# if defined(__GNUC__) || defined(__clang__)
__attribute__((__format__ (__printf__, 2, 3)))
# endif
;
// A utility routine: write "data" to the named file.
extern Status WriteStringToFile(Env* env, const Slice& data,
const std::string& fname);
// A utility routine: read contents of named file into *data
extern Status ReadFileToString(Env* env, const std::string& fname,
std::string* data);
// An implementation of Env that forwards all calls to another Env.
// May be useful to clients who wish to override just part of the
// functionality of another Env.
class EnvWrapper : public Env {
public:
// Initialize an EnvWrapper that delegates all calls to *t
explicit EnvWrapper(Env* t) : target_(t) { }
virtual ~EnvWrapper();
// Return the target to which this Env forwards all calls
Env* target() const { return target_; }
// The following text is boilerplate that forwards all methods to target()
Status NewSequentialFile(const std::string& f, SequentialFile** r) {
return target_->NewSequentialFile(f, r);
}
Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) {
return target_->NewRandomAccessFile(f, r);
}
Status NewWritableFile(const std::string& f, WritableFile** r) {
return target_->NewWritableFile(f, r);
}
bool FileExists(const std::string& f) { return target_->FileExists(f); }
Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
return target_->GetChildren(dir, r);
}
Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); }
Status CreateDir(const std::string& d) { return target_->CreateDir(d); }
Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); }
Status GetFileSize(const std::string& f, uint64_t* s) {
return target_->GetFileSize(f, s);
}
Status RenameFile(const std::string& s, const std::string& t) {
return target_->RenameFile(s, t);
}
Status LockFile(const std::string& f, FileLock** l) {
return target_->LockFile(f, l);
}
Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
void Schedule(void (*f)(void*), void* a) {
return target_->Schedule(f, a);
}
void StartThread(void (*f)(void*), void* a) {
return target_->StartThread(f, a);
}
virtual Status GetTestDirectory(std::string* path) {
return target_->GetTestDirectory(path);
}
virtual Status NewLogger(const std::string& fname, Logger** result) {
return target_->NewLogger(fname, result);
}
uint64_t NowMicros() {
return target_->NowMicros();
}
void SleepForMicroseconds(int micros) {
target_->SleepForMicroseconds(micros);
}
private:
Env* target_;
};
} // namespace leveldb
#endif // STORAGE_LEVELDB_INCLUDE_ENV_H_

View File

@@ -0,0 +1,70 @@
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// A database can be configured with a custom FilterPolicy object.
// This object is responsible for creating a small filter from a set
// of keys. These filters are stored in leveldb and are consulted
// automatically by leveldb to decide whether or not to read some
// information from disk. In many cases, a filter can cut down the
// number of disk seeks form a handful to a single disk seek per
// DB::Get() call.
//
// Most people will want to use the builtin bloom filter support (see
// NewBloomFilterPolicy() below).
#ifndef STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_
#define STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_
#include <string>
namespace leveldb {
class Slice;
class FilterPolicy {
public:
virtual ~FilterPolicy();
// Return the name of this policy. Note that if the filter encoding
// changes in an incompatible way, the name returned by this method
// must be changed. Otherwise, old incompatible filters may be
// passed to methods of this type.
virtual const char* Name() const = 0;
// keys[0,n-1] contains a list of keys (potentially with duplicates)
// that are ordered according to the user supplied comparator.
// Append a filter that summarizes keys[0,n-1] to *dst.
//
// Warning: do not change the initial contents of *dst. Instead,
// append the newly constructed filter to *dst.
virtual void CreateFilter(const Slice* keys, int n, std::string* dst)
const = 0;
// "filter" contains the data appended by a preceding call to
// CreateFilter() on this class. This method must return true if
// the key was in the list of keys passed to CreateFilter().
// This method may return true or false if the key was not on the
// list, but it should aim to return false with a high probability.
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
};
// Return a new filter policy that uses a bloom filter with approximately
// the specified number of bits per key. A good value for bits_per_key
// is 10, which yields a filter with ~ 1% false positive rate.
//
// Callers must delete the result after any database that is using the
// result has been closed.
//
// Note: if you are using a custom comparator that ignores some parts
// of the keys being compared, you must not use NewBloomFilterPolicy()
// and must provide your own FilterPolicy that also ignores the
// corresponding parts of the keys. For example, if the comparator
// ignores trailing spaces, it would be incorrect to use a
// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
// trailing spaces in keys.
extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);
}
#endif // STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_

View File

@@ -0,0 +1,100 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// An iterator yields a sequence of key/value pairs from a source.
// The following class defines the interface. Multiple implementations
// are provided by this library. In particular, iterators are provided
// to access the contents of a Table or a DB.
//
// Multiple threads can invoke const methods on an Iterator without
// external synchronization, but if any of the threads may call a
// non-const method, all threads accessing the same Iterator must use
// external synchronization.
#ifndef STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
#define STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
#include "leveldb/slice.h"
#include "leveldb/status.h"
namespace leveldb {
class Iterator {
public:
Iterator();
virtual ~Iterator();
// An iterator is either positioned at a key/value pair, or
// not valid. This method returns true iff the iterator is valid.
virtual bool Valid() const = 0;
// Position at the first key in the source. The iterator is Valid()
// after this call iff the source is not empty.
virtual void SeekToFirst() = 0;
// Position at the last key in the source. The iterator is
// Valid() after this call iff the source is not empty.
virtual void SeekToLast() = 0;
// Position at the first key in the source that at or past target
// The iterator is Valid() after this call iff the source contains
// an entry that comes at or past target.
virtual void Seek(const Slice& target) = 0;
// Moves to the next entry in the source. After this call, Valid() is
// true iff the iterator was not positioned at the last entry in the source.
// REQUIRES: Valid()
virtual void Next() = 0;
// Moves to the previous entry in the source. After this call, Valid() is
// true iff the iterator was not positioned at the first entry in source.
// REQUIRES: Valid()
virtual void Prev() = 0;
// Return the key for the current entry. The underlying storage for
// the returned slice is valid only until the next modification of
// the iterator.
// REQUIRES: Valid()
virtual Slice key() const = 0;
// Return the value for the current entry. The underlying storage for
// the returned slice is valid only until the next modification of
// the iterator.
// REQUIRES: !AtEnd() && !AtStart()
virtual Slice value() const = 0;
// If an error has occurred, return it. Else return an ok status.
virtual Status status() const = 0;
// Clients are allowed to register function/arg1/arg2 triples that
// will be invoked when this iterator is destroyed.
//
// Note that unlike all of the preceding methods, this method is
// not abstract and therefore clients should not override it.
typedef void (*CleanupFunction)(void* arg1, void* arg2);
void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
private:
struct Cleanup {
CleanupFunction function;
void* arg1;
void* arg2;
Cleanup* next;
};
Cleanup cleanup_;
// No copying allowed
Iterator(const Iterator&);
void operator=(const Iterator&);
};
// Return an empty iterator (yields nothing).
extern Iterator* NewEmptyIterator();
// Return an empty iterator with the specified status.
extern Iterator* NewErrorIterator(const Status& status);
} // namespace leveldb
#endif // STORAGE_LEVELDB_INCLUDE_ITERATOR_H_

Some files were not shown because too many files have changed in this diff Show More