<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
xmlns:xlink="http://www.w3.org/1999/xlink">
<teiHeader xml:lang="en">
<fileDesc>
<titleStmt>
<title level="a" type="main">Ambry</title>
</titleStmt>
<publicationStmt>
<publisher>ACM</publisher>
<availability status="unknown"><p>Copyright ACM</p>
</availability>
<date type="published" when="2016-06-14">2016-06-14</date>
</publicationStmt>
<sourceDesc>
<biblStruct>
<analytic>
<author>
<persName coords="1,140.25,118.05,93.04,11.06"><forename type="first">Shadi</forename><forename type="middle">A</forename><surname>Noghabi</surname></persName>
<affiliation key="aff0">
<note type="raw_affiliation"><label>1</label> University of Illinois at Urbana-Champaign,</note>
<orgName type="institution">University of Illinois at Urbana-Champaign</orgName>
</affiliation>
</author>
<author>
<persName coords="1,244.08,118.05,109.56,11.06"><forename type="first">Sriram</forename><surname>Subramanian</surname></persName>
<email>srsubramanian@linkedin.com</email>
<affiliation key="aff1">
<note type="raw_affiliation"><label>2</label> LinkedIn Corp.</note>
<orgName type="institution">LinkedIn Corp</orgName>
</affiliation>
</author>
<author>
<persName coords="1,364.44,118.05,100.88,11.06"><forename type="first">Priyesh</forename><surname>Narayanan</surname></persName>
<email>pnarayanan@linkedin.com</email>
<affiliation key="aff1">
<note type="raw_affiliation"><label>2</label> LinkedIn Corp.</note>
<orgName type="institution">LinkedIn Corp</orgName>
</affiliation>
</author>
<author>
<persName coords="1,105.68,130.50,113.27,11.06"><forename type="first">Sivabalan</forename><surname>Narayanan</surname></persName>
<email>snarayanan@linkedin.com</email>
</author>
<author>
<persName coords="1,229.75,130.50,107.81,11.06"><forename type="first">Gopalakrishna</forename><surname>Holla</surname></persName>
<email>gholla@linkedin.com</email>
<affiliation key="aff1">
<note type="raw_affiliation"><label>2</label> LinkedIn Corp.</note>
<orgName type="institution">LinkedIn Corp</orgName>
</affiliation>
</author>
<author>
<persName coords="1,348.36,130.50,87.03,11.06"><forename type="first">Mammad</forename><surname>Zadeh</surname></persName>
<email>mzadeh@linkedin.com</email>
<affiliation key="aff1">
<note type="raw_affiliation"><label>2</label> LinkedIn Corp.</note>
<orgName type="institution">LinkedIn Corp</orgName>
</affiliation>
</author>
<author>
<persName coords="1,446.19,130.50,53.69,11.06"><forename type="first">Tianwei</forename><surname>Li</surname></persName>
<email>tili@linkedin.com</email>
<affiliation key="aff1">
<note type="raw_affiliation"><label>2</label> LinkedIn Corp.</note>
<orgName type="institution">LinkedIn Corp</orgName>
</affiliation>
</author>
<author>
<persName coords="1,214.91,142.80,74.97,11.06"><forename type="first">Indranil</forename><surname>Gupta</surname></persName>
<affiliation key="aff0">
<note type="raw_affiliation"><label>1</label> University of Illinois at Urbana-Champaign,</note>
<orgName type="institution">University of Illinois at Urbana-Champaign</orgName>
</affiliation>
<affiliation key="aff1">
<note type="raw_affiliation"><label>2</label> LinkedIn Corp.</note>
<orgName type="institution">LinkedIn Corp</orgName>
</affiliation>
</author>
<author>
<persName coords="1,300.67,142.80,89.99,11.06"><forename type="first">Roy</forename><forename type="middle">H</forename><surname>Campbell</surname></persName>
<affiliation key="aff0">
<note type="raw_affiliation"><label>1</label> University of Illinois at Urbana-Champaign,</note>
<orgName type="institution">University of Illinois at Urbana-Champaign</orgName>
</affiliation>
</author>
<author>
<affiliation key="aff2">
<note type="raw_affiliation">SIGMOD&apos;16, June 26-July 01, 2016, San Francisco, CA, USA</note>
<orgName type="department">SIGMOD&apos;16</orgName>
<address>
<addrLine>June 26-July 01</addrLine>
<postCode>2016</postCode>
<settlement>San Francisco</settlement>
<region>CA</region>
<country key="US">USA</country>
</address>
</affiliation>
</author>
<title level="a" type="main">Ambry</title>
</analytic>
<monogr>
<title level="m">Proceedings of the 2016 International Conference on Management of Data</title>
<meeting>the 2016 International Conference on Management of Data </meeting>
<imprint>
<publisher>ACM</publisher>
<date type="published" when="2016-06-14" />
</imprint>
</monogr>
<idno type="MD5">F0697DECED3003C6830CEF8390D403FE</idno>
<idno type="DOI">10.1145/2882903.2903738</idno>
</biblStruct>
</sourceDesc>
</fileDesc>
<encodingDesc>
<appInfo>
<application version="0.7.3-SNAPSHOT" ident="GROBID" when="2023-04-18T04:30+0000">
<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
<ref target="https://github.com/kermitt2/grobid"/>
</application>
</appInfo>
</encodingDesc>
<profileDesc>
<textClass>
<keywords>
<term>Object Store</term>
<term>Geographically Distributed</term>
<term>Scalable</term>
<term>Load Balancing</term>
</keywords>
</textClass>
<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p><s coords="1,53.80,230.57,239.11,7.86;1,53.80,240.20,239.11,7.86;1,53.80,249.82,157.44,7.86">The infrastructure beneath a worldwide social network has to continually serve billions of variable-sized media objects such as photos, videos, and audio clips.</s><s coords="1,215.28,249.82,77.63,7.86;1,53.80,259.44,239.11,7.86;1,53.80,269.07,239.11,7.86;1,53.80,278.69,76.24,7.86">These objects must be stored and served with low latency and high throughput by a system that is geo-distributed, highly scalable, and load-balanced.</s><s coords="1,135.86,278.69,157.04,7.86;1,53.80,288.31,222.16,7.86">Existing file systems and object stores face several challenges when serving such large objects.</s><s coords="1,280.10,288.31,12.80,7.86;1,53.80,297.94,239.10,7.86;1,53.80,307.56,129.09,7.86">We present Ambry, a production-quality system for storing large immutable data (called blobs).</s><s coords="1,192.54,307.56,100.37,7.86;1,53.80,317.19,239.10,7.86;1,53.80,326.81,239.11,7.86;1,53.80,336.43,214.34,7.86">Ambry is designed in a decentralized way and leverages techniques such as logical blob grouping, asynchronous replication, rebalancing mechanisms, zero-cost failure detection, and OS caching.</s><s coords="1,275.26,336.43,17.65,7.86;1,53.80,346.06,239.11,7.86;1,53.80,355.68,239.11,7.86;1,53.80,365.31,142.15,7.86">Ambry has been running in LinkedIn's production environment for the past 2 years, serving up to 10K requests per second across more than 400 million users.</s><s coords="1,200.19,365.31,92.72,7.86;1,53.80,374.93,239.11,7.86;1,53.80,384.55,239.11,7.86;1,53.80,394.18,239.11,7.86;1,53.80,403.80,235.06,7.86">Our experimental evaluation reveals that Ambry offers high efficiency (utilizing up to 88% of the network bandwidth), low latency (less than 50 ms latency for a 1 MB object), and load balancing (improving imbalance of request rate among disks by 8x-10x).</s></p></div>
</abstract>
</profileDesc>
</teiHeader>
<facsimile>
<surface n="1" ulx="0.0" uly="0.0" lrx="612.0" lry="792.0"/>
<surface n="2" ulx="0.0" uly="0.0" lrx="612.0" lry="792.0"/>
<surface n="3" ulx="0.0" uly="0.0" lrx="612.0" lry="792.0"/>
<surface n="4" ulx="0.0" uly="0.0" lrx="612.0" lry="792.0"/>
<surface n="5" ulx="0.0" uly="0.0" lrx="612.0" lry="792.0"/>
<surface n="6" ulx="0.0" uly="0.0" lrx="612.0" lry="792.0"/>
<surface n="7" ulx="0.0" uly="0.0" lrx="612.0" lry="792.0"/>
<surface n="8" ulx="0.0" uly="0.0" lrx="612.0" lry="792.0"/>
<surface n="9" ulx="0.0" uly="0.0" lrx="612.0" lry="792.0"/>
<surface n="10" ulx="0.0" uly="0.0" lrx="612.0" lry="792.0"/>
<surface n="11" ulx="0.0" uly="0.0" lrx="612.0" lry="792.0"/>
<surface n="12" ulx="0.0" uly="0.0" lrx="612.0" lry="792.0"/>
<surface n="13" ulx="0.0" uly="0.0" lrx="612.0" lry="792.0"/>
</facsimile>
<text xml:lang="en">
<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1." coords="1,53.80,472.17,116.20,10.75">INTRODUCTION</head><p><s coords="1,62.77,485.57,230.14,7.86;1,53.80,495.19,164.77,7.86">During the past decade, social networks have become popular communication channels worldwide.</s><s coords="1,223.96,495.19,68.95,7.86;1,53.80,504.82,239.11,7.86;1,53.80,514.44,239.11,7.86;1,53.80,524.06,26.40,7.86">Hundreds of millions of users continually upload and view billions of diverse massive media objects, from photos and videos to documents.</s><s coords="1,89.61,524.06,203.29,7.86;1,53.80,533.69,239.11,7.86;1,53.80,543.31,144.47,7.86">These large media objects, called blobs, are uploaded once, frequently accessed from all around the world, never modified, and rarely deleted.</s><s coords="1,205.36,543.31,87.55,7.86;1,53.80,552.94,239.11,7.86;1,53.80,562.56,239.11,7.86;1,53.80,572.18,236.95,7.86">LinkedIn, as a global large-scale social network company, has faced the need for a geographically distributed system that stores and retrieves these read-heavy blobs in an efficient and scalable manner.</s></p><p><s coords="1,62.77,581.81,205.26,7.86">Handling blobs poses a number of unique challenges.</s><s coords="1,271.77,581.81,21.93,7.86;1,53.80,591.43,239.10,7.86;1,53.80,601.05,239.11,7.86;1,53.80,610.68,30.99,7.86">First, due to diversity in media types, blob sizes vary significantly from tens of KBs (e.g., profile pictures) to a few GBs (e.g., videos).</s><s coords="1,89.18,610.68,203.73,7.86;1,316.81,216.96,167.33,7.86">The system needs to store both massive blobs and a large number of small blobs efficiently.</s><s coords="1,490.63,216.96,65.29,7.86;1,316.81,226.59,239.11,7.86;1,316.81,236.21,27.71,7.86">Second, there is an ever-growing number of blobs that need to be stored and served.</s><s coords="1,351.55,236.21,204.38,7.86;1,316.81,245.83,224.16,7.86">Currently, LinkedIn serves more than 800 million put and get operations per day (over 120 TB in size).</s><s coords="1,547.49,245.83,8.44,7.86;1,316.81,255.46,239.10,7.86;1,316.81,265.08,151.81,7.86">In the past 12 months, the request rate has almost doubled, from 5k requests/s to 9.5k requests/s.</s><s coords="1,472.65,265.08,83.27,7.86;1,316.81,274.71,239.11,7.86;1,316.81,284.33,100.63,7.86">This rapid growth in requests magnifies the necessity for a linearly scalable system (with low overhead).</s><s coords="1,421.45,284.33,134.48,7.86;1,316.81,293.95,239.11,7.86;1,316.81,303.58,186.19,7.86">Third, the variability in workload and cluster expansions can create unbalanced load, degrading the latency and throughput of the system.</s><s coords="1,507.18,303.58,48.74,7.86;1,316.81,313.20,103.88,7.86">This creates a need for load-balancing.</s><s coords="1,424.75,313.20,131.17,7.86;1,316.81,322.83,210.99,7.86">Finally, users expect the uploading process to be fast, durable, and highly available.</s><s coords="1,532.13,322.83,23.79,7.86;1,316.81,332.45,239.11,7.86;1,316.81,342.07,239.11,7.86;1,316.81,351.70,189.89,7.86">When a user uploads a blob, all his/her friends from all around the globe should be able to see the blob with very low latency, even if parts of the internal infrastructure fail.</s><s coords="1,512.20,351.70,43.71,7.86;1,316.81,361.32,239.10,7.86;1,316.81,370.94,239.11,7.86;1,316.81,380.57,66.33,7.86">To provide these properties, data has to be reliably replicated across the globe in multiple datacenters, while maintaining low latency for each request.</s></p><p><s 
coords="1,325.78,390.19,230.15,7.86;1,316.81,399.82,239.11,7.86;1,316.81,409.44,239.11,7.86">LinkedIn had its own home-grown solution called Media Server, built using network attached storage filers (for file storage), Oracle database (for metadata), and Solaris boxes.</s><s coords="1,316.81,419.06,152.25,7.86">Media Server had multiple drawbacks.</s><s coords="1,472.97,419.06,82.94,7.86;1,316.81,428.69,239.11,7.86;1,316.81,438.31,239.11,7.86;1,316.81,447.94,17.45,7.86">It faced CPU and IO spikes caused by numerous metadata operations for small objects, was not horizontally scalable, and was very expensive.</s><s coords="1,338.14,447.94,217.78,7.86;1,316.81,457.56,239.12,7.86;1,316.81,467.18,88.23,7.86">Given that LinkedIn was scaling rapidly and the future web content will be largely dominated by media, we needed to find a replacement.</s></p><p><s coords="1,325.78,476.81,230.13,7.86;1,316.81,486.43,239.11,7.86;1,316.81,496.06,147.70,7.86">Several systems have been designed for handling a large amount of data, but none of them satisfactorily meet the requirements and scale LinkedIn needs.</s><s coords="1,468.36,496.06,87.55,7.86;1,316.81,505.68,239.11,7.86">There has been extensive research into distributed file systems <ref type="bibr" coords="1,486.65,505.68,14.32,7.86" target="#b9">[10,</ref><ref type="bibr" coords="1,502.31,505.68,11.77,7.86" target="#b15">16,</ref><ref type="bibr" coords="1,515.41,505.68,11.76,7.86" target="#b21">22,</ref><ref type="bibr" coords="1,528.51,505.68,11.76,7.86" target="#b23">24,</ref><ref type="bibr" coords="1,541.60,505.68,10.74,7.86" target="#b27">28]</ref>.</s><s coords="1,316.81,515.30,239.10,7.86;1,316.81,524.93,162.52,7.86">These systems have a number of limitations when used for storing blobs, as pointed out by <ref type="bibr" coords="1,453.57,524.93,9.71,7.86" target="#b2">[3,</ref><ref type="bibr" coords="1,465.02,524.93,10.74,7.86" target="#b10">11]</ref>.</s><s coords="1,486.16,524.93,69.76,7.86;1,316.81,534.55,239.11,7.86;1,316.81,544.17,239.11,7.86;1,316.81,553.80,38.13,7.86">For instance, the hierarchical directory structure and rich metadata are an overkill for a blob store and impose unnecessary additional overhead.</s></p><p><s coords="1,325.78,563.42,230.15,7.86;1,316.81,573.05,153.07,7.86">Many key value stores <ref type="bibr" coords="1,418.04,563.42,9.72,7.86" target="#b1">[2,</ref><ref type="bibr" coords="1,428.92,563.42,7.16,7.86" target="#b4">5,</ref><ref type="bibr" coords="1,437.22,563.42,7.16,7.86" target="#b7">8,</ref><ref type="bibr" coords="1,445.54,563.42,11.76,7.86" target="#b13">14]</ref> have also been designed for storing a large number of objects.</s><s coords="1,475.50,573.05,80.42,7.86;1,316.81,582.67,239.11,7.86;1,316.81,592.29,198.48,7.86">Although these systems can handle many small objects, they are not optimized for storing large objects (tens of MBs to GBs).</s><s coords="1,523.01,592.29,32.91,7.86;1,316.81,601.92,239.11,7.86;1,316.81,611.54,239.11,7.86;1,316.81,621.17,20.44,7.86">Further, they impose extra overhead for providing consistency guarantees while these are typically not needed for immutable data.</s><s coords="1,342.05,621.17,213.87,7.86;1,316.81,630.79,239.11,7.86;1,316.81,640.41,69.90,7.86">Some examples of these overheads include using vector clocks, conflict resolution mechanism, logging, and central coordinators.</s></p><p><s coords="1,325.78,650.04,230.14,7.86;1,316.81,659.66,239.11,7.86;1,316.81,669.29,172.79,7.86">A few systems have been designed specifically for large immutable objects 
including Facebook's Haystack <ref type="bibr" coords="1,521.49,659.66,9.72,7.86" target="#b2">[3]</ref> along with f4 <ref type="bibr" coords="1,349.21,669.29,14.32,7.86" target="#b17">[18]</ref> and Twitter's Blob Store <ref type="bibr" coords="1,472.74,669.29,13.49,7.86" target="#b26">[27]</ref>.</s><s coords="1,495.12,669.29,60.80,7.86;1,316.81,678.91,239.11,7.86;1,316.81,688.53,85.02,7.86">However, these systems do not resolve load imbalance, especially when cluster expansions occur.</s></p><p><s coords="1,325.78,698.16,230.14,7.86;1,316.81,707.78,239.11,7.86;2,53.80,57.64,239.11,7.86;2,53.80,67.26,195.91,7.86">In this paper we present Ambry, a production-quality system designed specifically for diverse large and small im-mutable data with read-heavy traffic, where data is written once, and read many times (&gt;95% read traffic).</s><s coords="2,255.26,67.26,37.64,7.86;2,53.80,76.88,157.34,7.86;2,53.80,86.48,239.11,7.89;2,53.80,96.13,239.11,7.86;2,53.80,105.76,239.11,7.86;2,53.80,115.38,76.10,7.86">Ambry is designed with four main goals in mind: 1) Low Latency and High Throughput: The system needs to serve a large number of requests per second in a timely fashion, while working on cheap commodity hardware (e.g., HDDs).</s><s coords="2,134.74,115.38,158.17,7.86;2,53.80,125.00,239.11,7.86;2,53.80,134.63,239.11,7.86;2,53.80,144.25,239.11,7.86;2,53.80,153.88,239.11,7.86;2,53.80,163.50,239.11,7.86;2,53.80,173.12,229.22,7.86">In order to reach this goal, Ambry utilizes a number of techniques including exploiting the OS cache, using zero copy when reading data from disk to network, chunking data along with retrieving/storing chunks in parallel from multiple nodes, providing configurable polices for the number of replicas to write and read, and zero-cost failure detection mechanisms (Sections 2.3, 4.2, and 4.3).</s></p><p><s coords="2,53.80,182.72,239.11,7.89;2,53.80,192.37,240.27,7.86;2,53.80,201.99,239.11,7.86">2) Geo-Distributed Operation: Blobs have to be replicated in other geographically distributed datacenters for high durability and availability, even in the presence of failures.</s><s coords="2,53.80,211.62,239.11,7.86;2,53.80,221.24,239.11,7.86;2,53.80,230.87,239.11,7.86;2,53.80,240.49,97.85,7.86">To achieve low latency and high throughput in this geodistributed setting, Ambry is designed as a decentralized multi-master system where data can be written to or read from any of the replicas.</s><s coords="2,155.70,240.49,137.20,7.86;2,53.80,250.11,239.11,7.86;2,53.80,259.74,173.16,7.86">Additionally, it uses asynchronous writes that write data to the closest datacenter and asynchronously replicate to other datacenter(s).</s><s coords="2,230.94,259.74,61.96,7.86;2,53.80,269.36,239.11,7.86;2,53.80,278.99,239.11,7.86;2,53.80,288.61,169.85,7.86">Also, for higher availability, it uses proxy requests that forward requests to other datacenters when the data is not replicated in the current datacenter yet (Sections 2.3 and 4.2).</s></p><p><s coords="2,53.80,298.21,239.11,7.89;2,53.80,307.86,221.59,7.86">3) Scalability: With the ever-growing amount of data, the system has to scale out efficiently with low overhead.</s><s coords="2,282.41,307.86,10.49,7.86;2,53.80,317.48,239.11,7.86">To achieve this goal, Ambry makes three main design choices.</s><s coords="2,53.80,327.11,239.11,7.86;2,53.80,336.73,239.10,7.86;2,53.80,346.35,215.43,7.86">First, Ambry separates the logical placement of blobs from their physical placement, allowing it to change the physical placement transparently from the logical 
placement.</s><s coords="2,276.54,346.35,16.37,7.86;2,53.80,355.98,239.11,7.86;2,53.80,365.60,124.57,7.86">Second, Ambry is designed as a completely decentralized system, with no manager/master.</s><s coords="2,183.58,365.60,109.32,7.86;2,53.80,375.22,239.11,7.86;2,53.80,384.85,239.10,7.86;2,53.80,394.47,123.02,7.86">Third, Ambry uses on-disk segmented indexing along with Bloom filters and an inmemory cache of the latest segment, allowing for scalable and efficient indexing of blobs.</s><s coords="2,180.90,394.47,53.68,7.86">(Section 4.3).</s><s coords="2,53.80,404.07,239.11,7.89;2,53.80,413.72,63.68,7.86">4) Load Balancing: The system has to stay balanced in spite of growth.</s><s coords="2,122.62,413.72,170.29,7.86;2,53.80,423.34,239.11,7.86;2,53.80,432.97,239.12,7.86;2,53.80,442.59,239.12,7.86;2,53.80,452.22,10.73,7.86">Ambry uses chunking of large blobs along with a random selection approach to remain balanced in a static cluster, and a re-balancing mechanism to return to a balanced state whenever cluster expansion occurs (Section 3).</s></p><p><s coords="2,62.77,461.84,230.14,7.86;2,53.80,471.46,239.11,7.86;2,53.80,481.09,39.60,7.86">Ambry has successfully been in production for the last 24 months, across four datacenters, serving more than 400 million users.</s><s coords="2,97.17,481.09,195.73,7.86;2,53.80,490.71,239.11,7.86;2,53.80,500.34,239.11,7.86;2,53.80,509.96,239.11,7.86;2,53.80,519.58,239.10,7.86;2,53.80,529.21,152.63,7.86">Our experimental results show that Ambry reaches high throughput (reaching up to 88% of the network bandwidth) and low latency (serving 1 MB blobs in less than 50 ms), works efficiently across multiple geo-distributed datacenters, and improves the imbalance among disks by a factor of 8x-10x while moving minimal data.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2." coords="2,53.80,551.38,139.56,10.75">SYSTEM OVERVIEW</head><p><s coords="2,62.77,564.78,230.14,7.86;2,53.80,574.41,239.11,7.86;2,53.80,584.03,239.12,7.86;2,53.80,593.65,98.48,7.86">In this section we discuss the overall design of Ambry including the high-level architecture of the system (Section 2.1), the notion of partition (Section 2.2), and supported operations (Section 2.3).</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1" coords="2,53.80,611.17,91.53,10.75">Architecture</head><p><s coords="2,62.77,624.57,230.14,7.86;2,53.80,634.19,239.11,7.86;2,53.80,643.82,17.47,7.86">Ambry is designed as a completely decentralized multitenant system across geographically distributed data centers.</s><s coords="2,78.81,643.82,214.09,7.86;2,53.80,653.44,23.39,7.86">The overall architecture of Ambry is shown in Figure 1.</s><s coords="2,82.32,653.44,210.59,7.86;2,53.80,663.07,239.11,7.86;2,53.80,672.69,239.11,7.86;2,53.80,682.31,94.60,7.86">The system is composed of three main components: Frontends that receive and route requests, Datanodes that store the actual data, and Cluster Managers that maintain the state of the cluster.</s><s coords="2,152.95,682.31,139.95,7.86;2,53.80,691.94,219.31,7.86">Each datacenter owns and runs its own set of these components in a decentralized fashion.</s><s coords="2,277.04,691.94,15.87,7.86;2,53.80,701.56,239.10,7.86;2,53.80,711.19,239.11,7.86;2,316.81,382.26,60.84,7.86">The Frontends and Datanodes are completely independent of one another, and the Cluster Managers are synchronized using Zookeeper <ref type="bibr" coords="2,360.78,382.26,13.49,7.86" target="#b11">[12]</ref>.</s><s coords="2,381.72,382.26,174.20,7.86;2,316.81,391.88,112.53,7.86">We provide an overview of each component below (details in Section 4).</s></p><p><s coords="2,325.78,401.48,230.15,7.89;2,316.81,411.13,144.51,7.86">Cluster Manager: Ambry organizes its data in virtual units called partitions (Section 2.2).</s><s coords="2,465.38,411.13,90.54,7.86;2,316.81,420.75,239.11,7.86;2,316.81,430.38,39.64,7.86">A partition is a logical grouping of a number of blobs, implemented as a large replicated file.</s><s coords="2,362.86,430.38,193.06,7.86;2,316.81,440.00,204.70,7.86">On creation, partitions are read-write, i.e., immutable blobs are read and new blobs can be added.</s><s coords="2,525.31,440.00,30.61,7.86;2,316.81,449.62,219.32,7.86">When a logical partition reaches its capacity, it turns read-only.</s><s coords="2,540.06,449.62,15.86,7.86;2,316.81,459.25,239.11,7.86;2,316.81,468.87,239.10,7.86;2,316.81,478.49,232.61,7.86">The Cluster Manager keeps track of the state (read-write/readonly) and location of each partition replica, along with the physical layout of the cluster (nodes and disk placement).</s></p><p><s coords="2,325.78,488.09,230.14,7.89;2,316.81,497.74,196.92,7.86">Frontend: The Frontends are in charge of receiving and routing requests in a multi-tenant environment.</s><s coords="2,520.81,497.74,35.12,7.86;2,316.81,507.37,210.84,7.86">The system serves three request types: put, get, and delete.</s><s coords="2,532.01,507.37,23.90,7.86;2,316.81,516.99,239.11,7.86;2,316.81,526.61,76.78,7.86">Popular data is handled by a Content Delivery Network (CDN) layer above Ambry.</s><s coords="2,397.47,526.61,158.45,7.86;2,316.81,536.24,215.63,7.86">Frontends receive requests directly from clients or through the CDN (if the data is cached).</s><s coords="2,540.06,536.24,15.87,7.86;2,316.81,545.86,239.11,7.86;2,316.81,555.49,239.10,7.86;2,316.81,565.11,62.73,7.86">The Frontends forward a request to the corresponding Datanode(s) and return the response to the client/CDN originating the request.</s></p><p><s coords="2,325.78,574.71,230.14,7.89">Datanode: Datanodes store and retrieve the actual data.</s><s coords="2,316.81,584.36,175.37,7.86">Each Datanode manages a number of disks.</s><s 
coords="2,496.21,584.36,59.71,7.86;2,316.81,593.98,239.11,7.86;2,316.81,603.61,239.11,7.86;2,316.81,613.23,79.32,7.86">For better performance, Datanodes maintain a number of additional data structures including: indexing of blobs, journals and Bloom filters (Section 4.3).</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2" coords="2,316.81,630.42,72.60,10.75">Partition</head><p><s coords="2,325.78,643.82,230.14,7.86;2,316.81,653.44,239.11,7.86;2,316.81,663.07,195.34,7.86">Instead of directly mapping blobs to physical machines, e.g., Chord <ref type="bibr" coords="2,365.01,653.44,14.31,7.86" target="#b25">[26]</ref> and CRUSH <ref type="bibr" coords="2,436.28,653.44,13.49,7.86" target="#b28">[29]</ref>, Ambry randomly groups blobs together into virtual units called partitions.</s><s coords="2,516.05,663.07,39.87,7.86;2,316.81,672.69,239.11,7.86;2,316.81,682.31,42.24,7.86">The physical placement of partitions on machines is done in a separate procedure.</s><s coords="2,362.86,682.31,193.06,7.86;2,316.81,691.94,239.11,7.86;2,316.81,701.56,239.11,7.86;2,316.81,711.19,71.97,7.86">This decoupling of the logical and physical placement enables transparent data movement (necessary for rebalancing) and avoids immediate rehashing of data during cluster expansion.</s></p><p><s coords="3,62.77,57.64,230.14,7.86;3,53.80,67.26,76.24,7.86">A partition is implemented as an append-only log in a preallocated large file.</s><s coords="3,134.56,67.26,158.35,7.86;3,53.80,76.88,122.45,7.86;3,179.59,75.12,3.65,5.24;3,183.74,76.88,2.56,7.86">Currently, partitions are fixed-size during the life-time of the system 1 .</s><s coords="3,191.22,76.88,101.69,7.86;3,53.80,86.51,239.10,7.86;3,53.80,96.13,239.10,7.86;3,53.80,105.76,239.11,7.86;3,53.80,115.38,26.59,7.86">The partition size should be large enough that the overhead of partitions, i.e., the additional data structures maintained per partition such as indexing, journals, and Bloom filters (Section 4.3), are negligible.</s><s coords="3,84.73,115.38,208.19,7.86;3,53.80,125.00,90.58,7.86">On the other hand, the failure recovery and rebuild time should be small.</s><s coords="3,152.78,125.00,140.12,7.86;3,53.80,134.63,32.86,7.86">We use 100 GB partitions in our clusters.</s><s coords="3,92.30,134.63,200.61,7.86;3,53.80,144.25,239.11,7.86;3,53.80,153.88,69.10,7.86">Since rebuilding is done in parallel from multiple replicas, we found that even 100 GB partitions can be rebuilt in a few minutes.</s></p><p><s coords="3,62.77,163.50,230.15,7.86;3,53.80,173.12,104.39,7.86">Blobs are sequentially written to partitions as put and delete entries (Figure <ref type="figure" coords="3,147.45,173.12,3.58,7.86" target="#fig_0">2</ref>).</s><s coords="3,167.21,173.12,125.70,7.86;3,53.80,182.75,219.21,7.86">Both entries contain a header (storing the offsets of fields in the entry) and a blob id.</s><s coords="3,277.04,182.75,15.87,7.86;3,53.80,192.37,239.11,7.86;3,53.80,201.99,239.11,7.86;3,53.80,211.62,107.53,7.86">The blob id is a unique identifier, generated by the Frontend during a put operation, and used during get/delete operations for locating the blob.</s><s coords="3,165.50,211.62,127.41,7.86;3,53.80,221.24,239.12,7.86;3,53.80,230.87,195.61,7.86">This id consists of the partition id in which the blob is placed (8 Bytes), followed by a 32 Byte universally unique id (UUID) for the blob.</s><s coords="3,254.46,230.87,38.45,7.86;3,53.80,240.49,239.11,7.86;3,53.80,251.39,14.96,7.86;3,68.76,249.62,16.71,5.24;3,85.97,251.39,6.13,7.86">Collisions in blob ids are possible, but very unlikely (the probability is &lt; 2 −320 ).</s><s coords="3,96.56,251.39,196.34,7.86;3,53.80,261.01,239.10,7.86;3,53.80,270.64,19.94,7.86">For a collision to occur, two put operations have to generate equal UUIDs and chose similar partitions for the blob.</s><s 
coords="3,77.71,270.64,215.19,7.86;3,53.80,280.26,66.30,7.86">Collisions are handled at the Datanodes by failing the late put request.</s></p><p><s coords="3,62.77,289.88,230.14,7.86;3,53.80,299.51,215.19,7.86">Put entries also include predefined properties including: blob size, time-to-live, creation time, and content type.</s><s coords="3,272.74,299.51,20.25,7.86;3,53.80,309.13,239.11,7.86;3,53.80,318.76,48.57,7.86">Also, there is an optional map of user defined properties followed by the blob.</s></p><p><s coords="3,62.77,328.38,230.15,7.86;3,53.80,338.00,188.42,7.86">In order to offer high availability and fault-tolerance, each partition is replicated on multiple Datanodes.</s><s coords="3,248.91,338.00,44.00,7.86;3,53.80,347.63,239.10,7.86;3,53.80,357.25,27.72,7.86">For replica placement, Ambry uses a greedy approach based on disk spaces.</s><s coords="3,86.63,357.25,206.27,7.86;3,53.80,366.88,239.11,7.86;3,53.80,376.50,239.11,7.86;3,53.80,386.12,132.69,7.86">This algorithm chooses the disk with the most unallocated space while ensuring constraints such as: 1) not having more than one replica per Datanode and 2) having replicas in multiple data centers.</s><s coords="3,191.41,386.12,101.49,7.86;3,53.80,395.75,239.11,7.86;3,53.80,405.37,26.13,7.86">Currently, the number of replicas per partition is configurable by the system administrator.</s><s coords="3,83.82,405.37,209.08,7.86;3,53.80,415.00,239.11,7.86;3,53.80,424.62,239.11,7.86;3,53.80,434.24,116.39,7.86">As part of future work, we plan to adaptively change the number of replicas based on the popularity of the partition, and use erasure coding for cold data to even further reduce the replication factor.</s></p><p><s coords="3,62.77,443.87,230.14,7.86;3,53.80,453.49,104.89,7.86">On creation, partitions are read-write, serving all operations (put, get and delete).</s><s coords="3,162.45,453.49,130.45,7.86;3,53.80,463.11,239.10,7.86;3,53.80,472.74,198.09,7.86">When the partition hits its upper threshold on size (capacity threshold) it becomes read-only, thereafter serving only get and delete operations.</s></p><p><s coords="3,62.77,482.36,230.14,7.86;3,53.80,491.99,239.11,7.86">The capacity threshold should be slightly less than the max capacity (80-90%) of the partition for two reasons.</s><s coords="3,53.80,501.61,239.11,7.86;3,53.80,511.23,239.11,7.86;3,53.80,520.86,122.82,7.86">First, after becoming read-only, replicas might not be completely in-sync and need free space to catch-up later (because of asynchronous writes).</s><s coords="3,181.33,520.86,111.58,7.86;3,53.80,530.48,88.04,7.86">Second, delete requests still append delete entries.</s></p><p><s coords="3,62.77,540.11,230.14,7.86;3,53.80,549.73,36.09,7.86">Deletes are similar to put operations, but on an existing blob.</s><s coords="3,96.45,549.73,196.45,7.86;3,53.80,559.35,239.11,7.86">By default, deletes result in appending a delete entry (with the delete flag set) for the blob (soft delete).</s><s coords="3,53.80,568.98,239.10,7.86;3,53.80,578.60,95.95,7.86">Deleted blobs are periodically cleaned up using an in-place compaction mechanism.</s><s coords="3,153.93,578.60,138.98,7.86;3,53.80,588.23,226.42,7.86">After compaction, read-only partitions can become read-write if enough space is freed-up.</s><s coords="3,284.47,588.23,8.44,7.86;3,53.80,597.85,239.10,7.86;3,53.80,607.47,157.12,7.86">In the rest of the paper we mainly focus on puts, due to the similarity of delete and put operations.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.3" coords="3,53.80,624.52,84.02,10.75">Operations</head><p><s coords="3,62.77,637.93,230.14,7.86;3,53.80,647.55,114.11,7.86">Ambry has a lightweight API supporting only 3 operations: put, get, and delete.</s><s coords="3,175.64,647.55,117.27,7.86;3,53.80,657.17,239.11,7.86;3,53.80,666.80,239.11,7.86;3,53.80,676.42,31.54,7.86">The request handling procedure is shown in Figure <ref type="figure" coords="3,160.26,657.17,3.58,7.86">3</ref>. On receiving a request, the Frontend optionally conducts some security checks on the request.</s><s coords="3,89.60,676.42,203.31,7.86;3,53.80,686.05,239.11,7.86;3,54.25,700.45,3.65,5.24;3,58.40,702.22,234.50,7.86;3,53.80,711.19,189.23,7.86">Then, using the Router Library (that contains the core logic of operation handling) it chooses a partition, com- 1 As part of future work we plan to investigate potential improvements by using variable-size partitions.</s><s coords="3,316.81,236.59,239.11,7.86;3,316.81,246.21,31.54,7.86">municates with the Datanode(s) in charge, and serves the request.</s><s coords="3,354.10,246.21,201.83,7.86;3,316.81,255.83,239.10,7.86;3,316.81,265.46,213.59,7.86">In the put operation, the partition is chosen randomly (for data balancing purposes), and in the get/delete operation the partition is extracted from the blob id.</s></p><p><s coords="3,325.78,275.08,230.14,7.86;3,316.81,284.70,199.79,7.86">Operations are handled in a multi-master design where operations can be served by any of the replicas.</s><s coords="3,523.70,284.70,32.22,7.86;3,316.81,294.33,239.11,7.86;3,316.81,303.95,64.81,7.86">The decision of how many replicas to contact is based on userdefined policies.</s><s coords="3,389.02,303.95,166.89,7.86;3,316.81,313.58,239.11,7.86;3,316.81,323.20,217.54,7.86">These policies are similar to consistency levels in Cassandra <ref type="bibr" coords="3,395.66,313.58,13.49,7.86" target="#b13">[14]</ref>, where they control how many (one, k, majority, all) replicas to involve in an operation.</s><s coords="3,542.47,323.20,13.45,7.86;3,316.81,332.82,239.11,7.86;3,316.81,342.45,239.11,7.86;3,316.81,352.07,221.71,7.86">For puts (or deletes), the request is forwarded to all replicas, and policies define the number of acknowledgments needed for a success (trade-off between durability and latency).</s><s coords="3,542.47,352.07,13.45,7.86;3,316.81,361.70,239.10,7.86;3,316.81,371.32,239.11,7.86;3,316.81,380.94,77.29,7.86">For gets, policies determine how many randomly selected replicas to contact for the operation (trade-off between resources usage and latency).</s><s coords="3,398.07,380.94,157.86,7.86;3,316.81,390.57,239.11,7.86">In practice, we found that for all operations the k = 2 replica policy gives us the balance we desire.</s><s coords="3,316.81,400.19,239.11,7.86;3,316.81,409.82,149.34,7.86">Stricter polices (involving more replicas) can be used to provide stronger consistency guarantees.</s></p><p><s coords="3,325.78,419.44,230.14,7.86;3,316.81,429.06,239.11,7.86;3,316.81,438.69,209.00,7.86">Additionally, performing write operations to all replicas placed in multiple geo-distributed datacenters in a synchronous fashion can affect the latency and throughput.</s><s coords="3,532.34,438.69,23.59,7.86;3,316.81,448.31,239.10,7.86;3,316.81,457.93,239.11,7.86;3,316.81,467.56,239.10,7.86;3,316.81,477.18,124.08,7.86">In order to alleviate this issue, Ambry uses asynchronous writes where puts are performed synchronously only in the local datacenter, i.e., the datacenter in which the 
Frontend receiving the request is located.</s><s coords="3,448.39,477.18,107.53,7.86;3,316.81,486.81,141.70,7.86">The request is counted as successfully finished at this point.</s><s coords="3,467.06,486.81,88.85,7.86;3,316.81,496.43,239.11,7.86;3,316.81,506.05,110.44,7.86">Later on, the blob is replicated to other datacenters using a lightweight replication algorithm (Section 5).</s></p><p><s coords="3,325.78,515.68,230.14,7.86;3,316.81,525.30,239.11,7.86;3,316.81,534.93,239.11,7.86;3,316.81,544.55,60.95,7.86">In order to provide read-after-write consistency in a datacenter to which a blob has not been replicated yet (e.g., writing to one datacenter and reading from another), Ambry uses proxy requests.</s><s coords="3,381.82,544.55,174.10,7.86;3,316.81,554.17,239.11,7.86;3,316.81,563.80,163.69,7.86">If the Frontend cannot retrieve a blob from its local datacenter, it proxies the request to another datacenter and returns the result from there.</s><s coords="3,484.73,563.80,71.19,7.86;3,316.81,573.42,239.11,7.86;3,316.81,583.05,238.76,7.86">Although a proxy request is expensive, in practice we found that proxy requests happen infrequently (less than 0.001% of the time).</s></p></div>
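<!--
  A minimal sketch of the {one, k, majority, all} replica policies described above, covering both the put path (how many acknowledgments count as success) and the get path (how many randomly selected replicas are contacted). Function names and the default k=2 are illustrative assumptions, not Ambry's API.

  import random

  def required_replicas(policy: str, num_replicas: int, k: int = 2) -> int:
      if policy == "one":
          return 1
      if policy == "k":
          return min(k, num_replicas)
      if policy == "majority":
          return num_replicas // 2 + 1
      if policy == "all":
          return num_replicas
      raise ValueError("unknown policy: " + policy)

  def put_succeeded(acks_received: int, policy: str, num_replicas: int) -> bool:
      # Puts are forwarded to all replicas; the policy sets how many acks mean success.
      return acks_received >= required_replicas(policy, num_replicas)

  def replicas_to_read(replicas: list, policy: str, k: int = 2) -> list:
      # Gets contact a randomly selected subset of replicas of the policy-defined size.
      return random.sample(replicas, required_replicas(policy, len(replicas), k))
-->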
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3." coords="3,316.81,601.54,130.67,10.75">LOAD BALANCING</head><p><s coords="3,325.78,614.95,230.15,7.86;3,316.81,624.57,239.11,7.86;3,316.81,634.19,107.80,7.86">Skewed workloads, massively large blobs, and cluster expansions create load imbalance and impact the throughput and latency of the system.</s><s coords="3,429.89,634.19,126.03,7.86;3,316.81,643.82,239.11,7.86;3,316.81,653.44,115.47,7.86">Ambry achieves load balancing (in terms of disk usage and request rates) in both static and dynamic (scale-out) clusters.</s></p><p><s coords="3,325.78,663.04,230.14,7.89;3,316.81,672.69,239.11,7.86;3,316.81,682.31,212.52,7.86">Static Cluster: Splitting large blobs into multiple small chunks (Section 4.2.1) as well as routing put operations to random partitions, achieves balance in partition sizes.</s><s coords="3,533.17,682.31,22.75,7.86;3,316.81,691.94,239.11,7.86;3,316.81,701.56,239.10,7.86;3,316.81,711.19,126.75,7.86">Additionally, using fairly large partition sizes along with relying on CDNs to handle very popular data significantly decrease the likelihood of hot partitions.</s><s coords="3,448.13,711.19,107.79,7.86;4,53.80,57.64,239.11,7.86;4,53.80,67.26,200.03,7.86">Using these techniques the load imbalance of request rates and partition sizes in production gets to as low as 5% amongst Datanodes.</s></p><p><s coords="4,62.77,76.86,230.14,7.89;4,53.80,86.51,239.11,7.86;4,53.80,96.13,106.82,7.86">Dynamic Cluster: In practice, read-write partitions receive all the write traffic and also the majority of the read traffic (due to popularity).</s><s coords="4,164.89,96.13,128.02,7.86;4,53.80,105.76,239.11,7.86;4,53.80,115.38,167.51,7.86">Since partitions grow in a semibalanced manner, the number of read-write partitions becomes the main factor of load imbalance.</s><s coords="4,226.41,115.38,66.50,7.86;4,53.80,125.00,239.10,7.86;4,53.80,134.63,239.11,7.86">After cluster expansion, new Datanodes contain only read-write partitions, while older Datanodes contain mostly read-only partitions.</s><s coords="4,53.80,144.25,239.11,7.86;4,53.80,153.88,120.71,7.86">This skewed distribution of read-write partitions creates a large imbalance in the system.</s><s coords="4,178.46,153.88,114.45,7.86;4,53.80,163.50,239.10,7.86;4,53.80,173.12,239.11,7.86;4,53.80,182.75,20.00,7.86">In our initial version, the average request rates of new Datanodes were up to 100x higher than old Datanodes and 10x higher than the average-aged ones.</s></p><p><s coords="4,62.77,192.37,231.61,7.86;4,53.80,201.99,239.11,7.86;4,53.80,211.62,239.11,7.86;4,53.80,221.24,43.50,7.86">To alleviate this issue, Ambry employs a rebalancing mechanism that returns the cluster to a semi-balanced state (in terms of disk usage and request rate) with minimal data movement.</s><s coords="4,102.56,221.24,190.35,7.86;4,53.80,230.87,234.61,7.86">The rebalancing approach reduces request rate and disk usage imbalance by 6-10x and 9-10x respectively.</s></p><p><s coords="4,62.77,240.49,230.14,7.86;4,53.80,250.11,239.10,7.86;4,53.80,259.74,239.11,7.86;4,53.80,269.36,195.30,7.86">Ambry defines the ideal (load balanced) state as a triplet (idealRW, idealRO, idealUsed) representing the ideal number of read-write partitions, ideal number of read-only partitions and ideal disk usage each disk should have.</s><s coords="4,253.13,269.36,39.78,7.86;4,53.80,278.99,239.11,7.86;4,53.80,288.61,239.11,7.86;4,53.80,298.23,239.11,7.86;4,53.80,307.86,48.94,7.86">This ideal state (idealRW, idealRO, idealUsed) is computed by 
dividing the total number of read-write/read-only partitions and total used disk space by the number of disks in the cluster, respectively.</s><s coords="4,108.08,307.86,184.83,7.86;4,53.80,317.48,239.11,7.86;4,53.80,327.11,106.45,7.86">A disk is considered above (or below) ideal if it has more (or less) read-write/read-only partitions or disk usage than the ideal state.</s></p><p><s coords="4,62.77,336.73,230.14,7.86;4,53.80,346.35,22.04,7.86">The rebalancing algorithm attempts to reach this ideal state.</s><s coords="4,81.59,346.35,211.32,7.86;4,53.80,355.98,239.11,7.86;4,53.80,365.60,104.10,7.86">This is done by moving partitions from disks above ideal to disks below ideal using a 2 phase approach, as shown in the pseudo-code below.</s><s coords="4,62.77,643.79,230.14,7.89;4,53.80,653.44,239.10,7.86;4,53.80,663.07,134.08,7.86">Phase1 -Move to Partition Pool: In this phase, Ambry moves partitions from disks above ideal into a pool, called partitionPool (Lines 6-13).</s><s coords="4,193.25,663.07,99.65,7.86;4,53.80,672.69,239.11,7.86;4,53.80,682.31,160.08,7.86">At the end of this phase no disk should remain above ideal, unless removing any partition would cause it to fall below ideal.</s></p><p><s coords="4,62.77,691.94,230.14,7.86;4,53.80,701.56,239.10,7.86;4,53.80,711.19,39.96,7.86">Ambry starts from read-write partitions (which are the main factor), and moves extra ones solely based on idealRW threshold.</s><s coords="4,98.73,711.19,194.18,7.86;4,316.81,194.69,239.11,7.86;4,316.81,204.31,100.32,7.86">The same process is repeated for read-only par- titions, but with considering both idealRO and idealUsed when moving partitions.</s><s coords="4,425.10,204.31,130.83,7.86;4,316.81,213.93,239.11,7.86">The strategy of choosing which partition to move is based on minimizing data movement.</s></p><p><s coords="4,316.81,223.56,239.11,7.86;4,316.81,233.18,239.11,7.86;4,316.81,242.81,184.61,7.86">For read-write partitions, the one with the minimum used capacity is chosen, while for read-only partitions, a random one is chosen since all such partitions are full.</s></p><p><s coords="4,325.78,252.40,230.14,7.89;4,316.81,262.05,239.11,7.86;4,316.81,271.68,239.11,7.86;4,316.81,281.30,124.08,7.86">Phase2 -Place Partitions on Disks: In this phase, Ambry places partitions from the partition pool on disks below ideal (Lines 14-16), starting from read-write partitions and then read-only ones.</s><s coords="4,446.50,281.30,109.42,7.86;4,316.81,290.93,184.37,7.86">Partitions are placed using a random round-robin approach (Line 17 <ref type="bibr" coords="4,481.47,290.93,15.77,7.86">-22)</ref>.</s><s coords="4,505.80,290.93,50.11,7.86;4,316.81,300.55,239.11,7.86;4,316.81,310.17,125.12,7.86">Ambry finds all disks below ideal, shuffles them, and assigns partitions to them in a round-robin fashion.</s><s coords="4,447.18,310.17,108.74,7.86;4,316.81,319.80,121.50,7.86">This procedure is repeated until the pool becomes empty.</s></p><p><s coords="4,325.78,329.42,230.15,7.86;4,316.81,339.04,239.11,7.86;4,316.81,348.67,239.11,7.86;4,316.81,358.29,239.11,7.86;4,316.81,367.92,149.91,7.86">After finding the the new placement, replicas are seamlessly moved by: 1) creating a new replica in the destination, 2) syncing the new replica with old ones using the replication protocol while serving new writes in all replicas, and 3) deleting the old replica after syncing.</s></p></div>
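<!--
  The pseudo-code the text refers to (Lines 6-13, 14-16, and 17-22) is a figure in the original paper and is not reproduced in this extraction. The following is a hedged Python reconstruction of the two-phase rebalancing from the prose above; the Disk attributes (rw_partitions, ro_partitions, used_space, place) and used_capacity are hypothetical names used only for illustration.

  import random

  def rebalance(disks, ideal_rw, ideal_ro, ideal_used):
      partition_pool = []

      # Phase 1: move partitions from disks above ideal into the pool.
      for disk in disks:
          # Read-write partitions first, based solely on the idealRW threshold.
          while len(disk.rw_partitions) > ideal_rw:
              p = min(disk.rw_partitions, key=lambda x: x.used_capacity)  # minimize data movement
              disk.rw_partitions.remove(p)
              partition_pool.append(p)
          # Read-only partitions next, considering both idealRO and idealUsed.
          while len(disk.ro_partitions) > ideal_ro and disk.used_space() > ideal_used:
              p = random.choice(disk.ro_partitions)  # all read-only partitions are full
              disk.ro_partitions.remove(p)
              partition_pool.append(p)

      # Phase 2: place pooled partitions on disks below ideal, round-robin over a shuffled list.
      below_ideal = [d for d in disks
                     if len(d.rw_partitions) < ideal_rw or len(d.ro_partitions) < ideal_ro]
      random.shuffle(below_ideal)
      i = 0
      while partition_pool and below_ideal:
          below_ideal[i % len(below_ideal)].place(partition_pool.pop())
          i += 1
-->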
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4." coords="4,316.81,389.01,170.62,10.75">COMPONENTS IN DETAIL</head><p><s coords="4,325.78,402.41,230.14,7.86;4,316.81,412.03,29.70,7.86">In this section we further discuss the main components of Ambry.</s><s coords="4,350.40,412.03,205.52,7.86;4,316.81,421.66,239.11,7.86;4,316.81,431.28,239.12,7.86;4,316.81,440.91,239.11,7.86;4,316.81,450.53,17.89,7.86">We describe the detailed state stored by the Cluster Manager (Section 4.1), extra responsibilities of Frontends including chunking and failure detection (Section 4.2), and additional structures maintained by the Datanodes (Section 4.3).</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.1" coords="4,316.81,467.50,114.22,10.75">Cluster Manager</head><p><s coords="4,325.78,480.91,230.14,7.86;4,316.81,490.53,53.88,7.86">The Cluster Manager is in charge of maintaining the state of the cluster.</s><s coords="4,374.49,490.53,181.43,7.86;4,316.81,500.15,217.71,7.86">Each datacenter has its local Cluster Manager instance(s) kept in-sync with others using Zookeeper.</s><s coords="4,540.05,500.15,15.87,7.86;4,316.81,509.78,239.11,7.86;4,316.81,519.40,239.11,7.86;4,316.81,529.03,27.36,7.86">The state stored by the Cluster Manager is very small (less than a few MBs in total), consisting of a hardware and logical layout.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.1.1" coords="4,319.55,545.08,110.30,9.44">Hardware Layout</head><p><s coords="4,325.78,557.55,230.14,7.86;4,316.81,567.17,239.10,7.86;4,316.81,576.80,126.01,7.86">The hardware layout includes information about the physical structure of the cluster, i.e., the arrangement of datacenters, Datanodes, and disks.</s><s coords="4,450.29,576.80,105.63,7.86;4,316.81,586.42,239.11,7.86;4,316.81,596.05,51.83,7.86">It also maintains the raw capacity and status, i.e., healthy (UP) or failed (DOWN), for each disk.</s><s coords="4,372.44,596.05,183.47,7.86;4,316.81,605.67,7.16,7.86">An example hardware layout is shown in Table <ref type="table" coords="4,316.81,605.67,3.58,7.86" target="#tab_1">1</ref>.</s><s coords="4,328.01,605.67,227.92,7.86;4,316.81,615.29,239.11,7.86;4,316.81,624.92,111.96,7.86">As shown, Ambry works in a heterogeneous environment with different hardware and configuration used inside and across different datacenters.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.1.2" coords="4,319.55,640.97,99.44,9.44">Logical Layout</head><p><s coords="4,325.78,653.44,230.14,7.86;4,316.81,663.07,239.11,7.86;4,316.81,672.69,37.85,7.86">The logical layout maintains the physical location of partition replicas, and the state (read-only/read-write) of each partition.</s><s coords="4,358.66,672.69,197.26,7.86;4,316.81,682.31,239.11,7.86;4,316.81,691.94,142.23,7.86">In order to find the state of a partition, the Cluster Manager periodically contacts the Datanodes, and requests the state of their partitions.</s><s coords="4,464.24,691.94,91.67,7.86;4,316.81,701.56,239.11,7.86;4,316.81,711.19,239.11,7.86">This layout is used for choosing a partition to write a new blob to (put operation), and locating the Datanode in charge of a given replica (all</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.2" coords="5,53.80,260.47,107.92,10.75">Frontend Layer</head><p><s coords="5,62.77,273.87,230.14,7.86;5,53.80,283.50,27.47,7.86">The Frontend is the entry point to Ambry for external requests.</s><s coords="5,85.27,283.50,182.01,7.86">Each datacenter has its own set of Frontends.</s><s coords="5,271.28,283.50,21.63,7.86;5,53.80,293.12,239.11,7.86;5,53.80,302.74,239.11,7.86;5,53.80,312.37,239.11,7.86;5,53.80,321.99,30.68,7.86">Frontends are decentralized involving no master or coordination, identically performing the same task, and stateless with all state stored in the Cluster Manager (which is periodically pulled).</s><s coords="5,92.46,321.99,200.45,7.86;5,53.80,331.62,239.11,7.86;5,53.80,341.24,239.12,7.86;5,53.80,350.86,239.11,7.86;5,53.80,360.49,56.48,7.86">This design enhances scalability (new Frontends can be added without much performance penalty), faulttolerance (requests can be forwarded to any Frontend), and failure recovery (failed Frontends can quickly be replaced) for Frontends.</s><s coords="5,114.36,360.49,172.49,7.86">Frontends have three main responsibilities:</s></p><p><s coords="5,64.58,376.43,228.33,7.86;5,76.21,386.06,216.70,7.86;5,76.21,395.68,216.69,7.86;5,76.21,405.31,52.55,7.86">1. Request Handling: This involves receiving requests, routing them to the corresponding Datanode(s) using the Router Library (Section 4.2.1), and sending back the response.</s><s coords="5,64.58,417.85,232.81,7.86;5,76.21,427.47,216.69,7.86">2. Security Checks: Optionally performing security checks, such as virus scanning and authentication on requests.</s></p><p><s coords="5,64.58,445.39,228.33,7.86;5,76.21,455.01,216.69,7.86;5,76.21,464.63,199.93,7.86">3. Capturing Operations: Pushing events to a change capture system out of Ambry for further offline analysis, such as finding request patterns of the system.</s><s coords="5,280.10,464.63,12.81,7.86;5,76.21,474.26,216.69,7.86;5,76.21,483.88,216.69,7.86;5,76.21,493.50,35.88,7.86">We use Kafka <ref type="bibr" coords="5,117.27,474.26,14.32,7.86" target="#b12">[13]</ref> as our change-capture system due to the high durability, high throughput, and low overhead it provides.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.2.1" coords="5,56.54,511.09,98.77,9.44">Router Library</head><p><s coords="5,62.77,523.56,230.14,7.86;5,53.80,533.19,210.94,7.86">The Router Library contains all the core logic of handling requests and communicating with Datanodes.</s><s coords="5,271.28,533.19,21.63,7.86;5,53.80,542.81,161.35,7.86">Frontends simply embed and use this library.</s><s coords="5,219.11,542.81,73.80,7.86;5,53.80,552.44,239.12,7.86;5,53.80,562.06,35.17,7.86">Clients can bypass Frontends by embedding this library and directly serving requests.</s><s coords="5,97.01,562.06,195.89,7.86;5,53.80,571.68,239.11,7.86;5,53.80,581.31,123.38,7.86">This library includes four main procedures: 1) policy-based routing, 2) chunking large blobs, 3) failure detection, and 4) proxy requests.</s></p><p><s coords="5,53.80,600.53,239.11,7.89;5,53.80,610.18,239.10,7.86;5,53.80,619.80,174.75,7.86">Policy Based Routing: On receiving a request, the library decides which partition to involve (randomly chosen for puts and extracted from blob id for gets/deletes).</s><s coords="5,232.38,619.80,60.54,7.86;5,53.80,629.43,239.11,7.86;5,53.80,639.05,239.11,7.86;5,53.80,648.67,111.20,7.86">Then, based on the policy used ({one, k, majority, all} discussed in Section 2.3), it communicates with the corresponding replica(s) until the request is served/failed.</s></p><p><s coords="5,53.80,667.90,239.11,7.89;5,53.80,677.55,239.12,7.86;5,316.81,231.55,30.68,7.86">Chunking: Extremely large blobs (e.g., videos) create load imbalance, block smaller blobs, and inherently have high latency.</s><s coords="5,352.75,231.55,203.17,7.86;5,316.81,241.17,175.01,7.86">To mitigate these issues, Ambry splits large blobs into smaller equal-size units called chunks.</s><s coords="5,498.15,241.17,57.76,7.86;5,316.81,250.80,239.12,7.86;5,316.81,260.42,175.89,7.86">A large chunk size does not fully resolve the large blob challenges and a small chunk size adds too much overhead.</s><s coords="5,500.39,260.42,55.52,7.86;5,316.81,270.05,239.11,7.86;5,316.81,279.67,208.38,7.86">Based on our current large blob size distribution, we found the sweet spot for the chunk size to be in the range of 4 to 8 MB<ref type="foot" coords="5,518.48,277.90,3.65,5.24" target="#foot_1">3</ref> .</s><s coords="5,325.78,289.29,230.14,7.86;5,316.81,298.92,212.96,8.35">During a put operation, a blob b is split into k chunks {c1, c2, ..., c k }, each treated as an independent blob.</s><s coords="5,536.10,298.92,19.83,7.86;5,316.81,308.54,239.11,7.86;5,316.81,318.17,239.11,7.86;5,316.81,327.79,37.85,7.86">Each chunk goes through the same steps as a normal put blob operation (Section 2.3), most likely being placed on a different partition.</s><s coords="5,358.46,327.79,197.45,7.86;5,316.81,337.41,81.56,7.86">It is also assigned a unique chunk id with the same format as a blob id.</s><s coords="5,403.94,337.41,151.98,7.86;5,316.81,347.04,239.11,8.35;5,316.81,356.66,239.11,7.86;5,316.81,366.28,36.50,7.86">In order to be able to retrieve b Ambry creates a metadata blob b metadata for b. 
b metadata stores the number of chunks and chunk ids in order, as shown in Figure <ref type="figure" coords="5,346.15,366.28,3.58,7.86" target="#fig_4">4</ref>.</s><s coords="5,358.28,366.28,197.64,7.86;5,316.81,375.91,239.11,8.35;5,316.81,385.53,98.82,7.86">This metadata blob is then put into Ambry as a normal blob and the blob id of b metadata is returned to the user as the blob id of b.</s><s coords="5,421.71,385.53,134.21,7.86;5,316.81,395.16,239.11,7.86;5,316.81,404.78,126.38,7.86">If the put fails before writing all chunks, the system will issue deletes for written chunks and the operation has to be redone.</s></p><p><s coords="5,325.78,414.40,230.14,7.86;5,316.81,424.03,86.70,7.86">During a get, the metadata blob is retrieved and chunk ids are extracted from it.</s><s coords="5,407.79,424.03,148.13,7.86;5,316.81,433.65,104.31,7.86">Then, Ambry uses a sliding buffer of size s to retrieve the blob.</s><s coords="5,425.15,433.65,130.78,7.86;5,316.81,443.28,239.10,7.86;5,316.81,452.90,239.10,7.86;5,316.81,462.52,48.75,7.86">Ambry queries the first s chunks of the blob independently and in parallel (since they are most likely written on unique partitions placed on separate Datanodes).</s><s coords="5,369.55,462.52,186.37,7.86;5,316.81,472.15,218.56,7.86">When the first chunk in the buffer is retrieved, Ambry slides the buffer to the next chunk, and so on.</s><s coords="5,540.05,472.15,15.87,7.86;5,316.81,481.77,239.11,7.86;5,316.81,491.40,139.78,7.86">The whole blob starts being returned to the user the moment the first chunk of the blob is retrieved.</s></p><p><s coords="5,325.78,501.02,230.14,7.86;5,316.81,510.64,239.12,7.86;5,316.81,520.27,239.11,7.86;5,316.81,529.89,73.69,7.86">Although an extra put/get is needed in this chunking mechanism (for the metadata blob), overall, our approach improves latency since multiple chunks are written and retrieved in parallel.</s></p><p><s coords="5,316.81,549.11,239.10,7.89;5,316.81,558.76,73.22,7.86">Zero-cost Failure Detection: Failures happen frequently in a large system.</s><s coords="5,396.39,558.76,159.53,7.86;5,316.81,568.39,183.67,7.86">They range from unresponsiveness and connection timeouts, to disk I/O problems.</s><s coords="5,510.10,568.39,45.83,7.86;5,316.81,578.01,239.11,7.86;5,316.81,587.63,239.11,7.86;5,316.81,597.26,23.02,7.86">Thus, Ambry needs a failure detection mechanism to discover unavailable Datanodes/disks and avoid forwarding requests to them.</s></p><p><s coords="5,325.78,606.88,230.15,7.86;5,316.81,616.51,239.11,7.86;5,316.81,626.13,128.79,7.86">Ambry employs a zero-cost failure detection mechanism involving no extra messages, such as heartbeats and pings, by leveraging request messages.</s><s coords="5,451.63,626.13,104.29,7.86;5,316.81,635.75,239.11,7.86;5,316.81,645.38,117.00,7.86">In practice, we found our failure detection mechanism is effective, simple, and consumes very little bandwidth.</s><s coords="5,439.95,645.38,115.97,7.86;5,316.81,655.00,36.28,7.86">This mechanism is shown in Figure <ref type="figure" coords="5,345.94,655.00,3.58,7.86" target="#fig_5">5</ref>.</s><s coords="5,357.43,655.00,198.49,7.86;5,316.81,664.62,239.11,7.86;5,316.81,674.25,164.03,7.86">In this approach, Ambry keeps track of the number of consecutive failed requests for a particular Datanode (or disk) in the last check period of time.</s><s coords="5,484.76,674.25,71.16,7.86;5,316.81,683.87,239.10,7.86;6,53.80,57.64,239.11,7.86;6,53.80,67.26,31.04,7.86">If this number exceeds a MAX FAIL threshold (in our example set to 2) the Datanode 
is marked as temporarily down for a wait period of time.</s><s coords="6,89.22,67.26,203.69,7.86;6,53.80,76.88,239.10,7.86;6,53.80,86.51,19.00,7.86">In this state all queued requests for this Datanode will eventually time out and need to be reattempted by the user.</s><s coords="6,78.90,86.51,214.01,7.86;6,53.80,96.13,120.07,7.86">After the wait period has passed, the Datanode becomes temporarily available.</s><s coords="6,180.06,96.13,112.84,7.86;6,53.80,105.76,239.11,7.86;6,53.80,115.38,239.11,7.86;6,53.80,125.00,24.03,7.86">When a Datanode is in the temporarily available phase, if the next request sent to that Datanode fails, it will move to the temporarily down phase again.</s><s coords="6,81.76,125.00,211.14,7.86;6,53.80,134.63,55.24,7.86">Otherwise, it will be marked as available, working as normal again.</s></p><p><s coords="6,53.80,153.85,239.11,7.89;6,53.80,163.50,239.10,7.86;6,53.80,173.12,163.78,7.86">Proxy Requests: As described in Section 2.3, Ambry uses proxy requests to reach higher availability and read-afterwrite consistency in remote datacenters.</s><s coords="6,223.13,173.12,69.78,7.86;6,53.80,182.75,239.10,7.86;6,53.80,192.37,239.11,7.86;6,53.80,201.99,92.34,7.86">When a blob has not been replicated in the local datacenter yet, requests for that blob are forwarded to other datacenters and served there (proxy requests).</s><s coords="6,151.15,201.99,141.77,7.86;6,53.80,211.62,239.12,7.86;6,53.80,221.24,126.20,7.86">However, datacenter partitions can cause unavailability of unreplicated data until the partition is healed and replicas converge.</s></p><p><s coords="6,62.77,230.87,230.14,7.86;6,53.80,240.49,173.55,7.86">Proxy requests are handled by the Router Library, transparently from the user issuing the request.</s><s coords="6,232.65,240.49,60.25,7.86;6,53.80,250.11,239.11,7.86;6,53.80,259.74,180.42,7.86">In practice, we found proxy requests occur less than 0.001% of the time, thus minimally affecting the user experience.</s></p></div>
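<!--
  A sketch of the zero-cost failure detection described above, driven only by the outcomes of ordinary requests (no heartbeats or pings). The class, the MAX_FAIL default of 2, and the wait period value are illustrative assumptions.

  import time

  class FailureDetector:
      def __init__(self, max_fail: int = 2, wait_period_s: float = 30.0):
          self.max_fail = max_fail
          self.wait_period_s = wait_period_s
          self.consecutive_failures = 0
          self.down_until = 0.0        # end of the "temporarily down" window
          self.probation = False       # True while "temporarily available"

      def is_available(self) -> bool:
          if time.time() < self.down_until:
              return False             # temporarily down: skip this Datanode/disk
          if self.down_until:
              self.probation = True    # wait period passed: temporarily available
              self.down_until = 0.0
          return True

      def on_response(self, success: bool) -> None:
          if success:
              self.consecutive_failures = 0
              self.probation = False   # fully available again
              return
          self.consecutive_failures += 1
          if self.probation or self.consecutive_failures > self.max_fail:
              self.down_until = time.time() + self.wait_period_s
              self.consecutive_failures = 0
              self.probation = False
-->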
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.3" coords="6,53.80,276.02,109.46,10.75">Datanode Layer</head><p><s coords="6,62.77,289.42,230.14,7.86">Datanodes are in charge of maintaining the actual data.</s><s coords="6,53.80,299.05,239.11,7.86;6,53.80,308.67,199.96,7.86">Each Datanode manages a number of disks, and responds to requests for partition replicas placed on its disks.</s><s coords="6,258.67,308.67,34.23,7.86;6,53.80,318.29,200.03,7.86">Puts are handled by writing to the end of the partition file.</s><s coords="6,257.81,318.29,35.10,7.86;6,53.80,327.92,239.11,7.86;6,53.80,337.54,141.30,7.86">Gets can be more time-consuming, especially since the location of the blob in the partition is not known.</s><s coords="6,200.01,337.54,92.90,7.86;6,53.80,347.17,227.62,7.86">To minimize both read and write latencies, Datanodes employ a few techniques:</s></p><p><s coords="6,67.12,361.79,225.79,7.89;6,76.21,371.44,216.69,7.86;6,76.21,381.07,129.65,7.86">• Indexing blobs: Ambry stores an index of blob offsets per partition replica to prevent sequential sweeps for finding blobs (Section 4.3.1).</s><s coords="6,67.12,393.93,225.78,7.89;6,76.21,403.58,216.70,7.86;6,76.21,413.21,171.14,7.86;6,67.12,426.07,225.79,7.89;6,76.21,435.72,216.68,7.86;6,76.21,445.34,176.41,7.86">• Exploiting OS cache: Ambry utilizes OS caching to serve most reads from the RAM, by limiting the RAM usage of other components (Section 4.3.2). • Batched writes, with a single disk seek: Ambry batches writes for a particular partition together and periodically flushes the writes to disk.</s><s coords="6,259.66,445.34,33.24,7.86;6,76.21,454.97,216.69,7.86;6,76.21,464.59,26.68,7.86">Thus, it incurs at most one disk seek for a batch of sequential writes.</s><s coords="6,106.97,464.59,185.93,7.86;6,76.21,474.22,88.56,7.86">The flush period is configurable and trades off latency for durability.</s><s coords="6,170.85,474.22,122.05,7.86;6,76.21,483.84,216.69,7.86;6,76.21,493.46,154.12,7.86">Although, batching can introduce overheads of flushing, dirty buffers, and tuning, the benefits outweigh these overheads.</s><s coords="6,67.12,506.33,225.78,7.89;6,76.21,515.98,216.69,7.86;6,76.21,525.60,216.69,7.86;6,76.21,535.23,68.25,7.86">• Keeping all file handles open: Since partitions are typically very large (100 GB in our setting), the number of partition replicas placed on a Datanode is small (a few hundred).</s><s coords="6,150.39,535.23,142.52,7.86;6,76.21,544.85,70.35,7.86">Thus, Ambry keeps all file handles open at all times.</s><s coords="6,67.12,557.71,225.78,7.89;6,76.21,567.36,216.69,7.86;6,76.21,576.99,216.70,7.86;6,76.21,586.61,124.90,7.86">• Zero copy gets: When reading a blob, Ambry utilizes a zero copy <ref type="bibr" coords="6,128.33,567.36,14.31,7.86" target="#b24">[25]</ref> mechanism, i.e., the kernel directly copies data from disk to the network buffer without going through the application.</s><s coords="6,207.91,586.61,84.99,7.86;6,76.21,596.24,216.70,7.86;6,76.21,605.86,107.68,7.86">This is feasible since the Datanodes do not perform any computation on the data at get operations.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.3.1" coords="6,56.54,621.72,71.00,9.44">Indexing</head><p><s coords="6,62.77,634.19,230.14,7.86;6,53.80,643.82,239.11,7.86;6,53.80,653.44,172.33,7.86">To find the location of a blob in a partition replica with low latency, the Datanode maintains a light-weight in-memory indexing per replica, as shown in Figure <ref type="figure" coords="6,218.97,653.44,3.58,7.86" target="#fig_6">6</ref>.</s><s coords="6,230.52,653.44,62.39,7.86;6,53.80,663.07,239.10,7.86;6,53.80,672.69,58.16,7.86">The indexing is sorted by blob id, mapping the blob id to the start offset of the blob entry.</s><s coords="6,115.88,672.69,177.03,7.86;6,53.80,682.31,239.11,7.86;6,53.80,691.94,15.33,7.86">The indexing is updated in an online fashion whenever blobs are put (e.g., blob 60) or deleted (e.g., blob 20).</s></p><p><s coords="6,62.77,701.56,230.14,7.86;6,53.80,711.19,239.11,7.86;6,325.78,326.12,230.14,7.86;6,316.81,335.75,206.89,7.86">Similar to SSTables <ref type="bibr" coords="6,143.29,701.56,9.20,7.86" target="#b4">[5]</ref>, Ambry limits the size of the index by splitting it into segments, storing old segments on disk, The indexing also stores a flag indicating if a blob has been deleted and optionally a time-to-live (TTL).</s><s coords="6,527.89,335.75,28.03,7.86;6,316.81,345.37,239.11,7.86;6,316.81,354.99,214.15,7.86">During get operations, if the blob has expired or been deleted, an error will be returned before reading the actual data.</s></p><p><s coords="6,325.78,364.62,230.14,7.86;6,316.81,374.24,239.11,7.86;6,316.81,383.87,90.44,7.86">Note that the indexing does not contain any additional information affecting the correctness of the system, and just improves performance.</s><s coords="6,411.19,383.87,144.73,7.86;6,316.81,393.49,177.55,7.86">If a Datanode fails, the whole indexing can be reconstructed from the partition.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.3.2" coords="6,319.55,411.03,144.43,9.44">Exploiting The OS Cache</head><p><s coords="6,325.78,423.50,230.14,7.86;6,316.81,433.12,239.11,7.86;6,316.81,442.75,64.63,7.86">Recently written data, which is the popular data as well, is automatically cached without any extra overhead (by the operating system).</s><s coords="6,385.46,442.75,170.47,7.86;6,316.81,452.37,239.11,7.86;6,316.81,462.00,39.15,7.86">By exploiting this feature, many reads can be served from memory, which significantly improves performance.</s><s coords="6,361.62,462.00,194.29,7.86;6,316.81,471.62,137.50,7.86">Thus, Ambry limits the memory usage of other data structures in the Datanode.</s><s coords="6,462.38,471.62,93.54,7.86;6,316.81,481.24,239.11,7.86;6,316.81,490.87,170.14,7.86">Ambry bounds the indexing by splitting it into segments, with only the latest segment remaining in-memory (Figure <ref type="figure" coords="6,476.22,490.87,3.58,7.86" target="#fig_6">6</ref>).</s><s coords="6,492.46,490.87,63.46,7.86;6,316.81,500.49,195.96,7.86">New entries are added to the in-memory segment of the indexing.</s><s coords="6,516.71,500.49,39.21,7.86;6,316.81,510.11,239.11,7.86;6,316.81,519.74,114.76,7.86">When the in-memory segment exceeds a maximum size it is flushed to disk as a read-only segment.</s><s coords="6,436.26,519.74,119.66,7.86;6,316.81,529.36,239.11,7.86;6,316.81,538.99,69.24,7.86">This design also helps toward failure recovery since only the in-memory segment needs to be reconstructed.</s><s coords="6,390.09,538.99,165.83,7.86;6,316.81,548.61,239.10,7.86;6,316.81,558.23,73.79,7.86">Looking up blob offsets is done in reverse chronological order, starting with the latest segment (inmemory segment).</s><s coords="6,394.54,558.23,161.38,7.86;6,316.81,567.86,130.38,7.86">Thus, a delete entry will be found before a put entry when reading a blob.</s><s coords="6,451.13,567.86,104.79,7.86;6,316.81,577.48,76.53,7.86">This ensures deleted blobs are not retrievable.</s><s coords="6,316.81,587.08,239.10,7.89;6,316.81,596.73,239.11,7.86;6,316.81,606.35,221.52,7.86">Bloom Filters: To reduce lookup latency for on-disk segments, Ambry maintains an in-memory Bloom filter for each segment, containing the blob ids in that index segment.</s><s coords="6,542.32,606.35,13.60,7.86;6,316.81,615.98,239.10,7.86;6,316.81,625.60,66.05,7.86">Using Bloom filters, Ambry quickly filters out which on-disk segment to load.</s><s coords="6,386.91,625.60,169.01,7.86;6,316.81,635.23,55.60,7.86">Thus, with high probability, it incurs only one disk seek.</s><s coords="6,376.88,635.23,179.04,7.86;6,316.81,644.85,239.11,7.86;6,316.81,654.47,42.10,7.86">However, due to our skewed workload a majority of reads just hit the in-memory segment, without any disk seeks.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5." coords="6,316.81,678.54,104.14,10.75">REPLICATION</head><p><s coords="6,325.78,691.94,230.14,7.86;6,316.81,701.56,239.11,7.86;6,316.81,711.19,120.68,7.86">Replicas belonging to the same partition can become out of sync due to either failures, or asynchronous writes that write to only one datacenter.</s><s coords="6,444.61,711.19,111.31,7.86;7,53.80,218.22,239.11,7.86;7,53.80,227.84,159.63,7.86">In order to fix inconsistent replicas, Ambry uses an asynchronous replication algorithm that periodically synchronizes replicas.</s><s coords="7,221.26,227.84,71.64,7.86;7,53.80,237.47,101.34,7.86">This algorithm is completely decentralized.</s><s coords="7,159.36,237.47,133.55,7.86;7,53.80,247.09,239.11,7.86;7,53.80,256.71,111.74,7.86">In this procedure each replica individually acts as a master and syncs-up with other replicas, in an all-to-all fashion.</s><s coords="7,171.39,256.71,121.52,7.86;7,53.80,266.34,239.11,7.86">Synchronization is done using an asynchronous two-phase replication protocol as follows.</s><s coords="7,53.80,275.96,239.11,7.86;7,53.80,285.59,237.18,7.86">This protocol is a pull-based approach where each replica independently requests for missing blobs from other replicas.</s></p><p><s coords="7,67.12,301.43,225.78,7.89;7,76.21,311.08,106.76,7.86">• First Phase: This phase finds missing blobs since the last synchronization point.</s><s coords="7,187.02,311.08,105.89,7.86;7,76.21,320.70,216.69,7.86;7,76.21,330.32,194.52,7.86">This is done by requesting blob ids of all blobs written since the latest syncing offset and then filtering the ones missing locally.</s></p><p><s coords="7,67.12,343.79,225.78,7.89">• Second Phase: This phase replicates missing blobs.</s></p><p><s coords="7,76.21,353.44,185.79,7.86">A request for only the missing blobs is sent.</s><s coords="7,269.38,353.44,23.53,7.86;7,76.21,363.07,216.69,7.86;7,76.21,372.69,29.17,7.86">Then, the missing blobs are transferred and appended to the replica.</s></p><p><s coords="7,62.77,388.56,230.14,7.86;7,53.80,398.18,239.11,7.86;7,53.80,407.80,140.89,7.86">In order to find the recently written blobs quickly, the replication algorithm maintains an additional data structure per replica, called a journal.</s><s coords="7,203.48,407.80,89.43,7.86;7,53.80,417.43,209.25,7.86">The journal is an inmemory cache of recent blobs ordered by their offset.</s><s coords="7,266.93,417.43,25.97,7.86;7,53.80,427.05,239.11,7.86;7,53.80,436.68,239.11,7.86;7,53.80,446.30,65.63,7.86">Figure <ref type="figure" coords="7,53.80,427.05,4.61,7.86" target="#fig_8">7</ref> shows example journals of two replicas (r1 and r2) and the two phase replication procedure for r1 syncing with r2 from latestOffset 600.</s><s coords="7,124.48,446.30,168.42,7.86;7,53.80,455.92,239.11,7.86;7,53.80,465.55,239.11,7.86;7,53.80,475.17,141.80,7.86">In the first phase, r1 requests all recently added blobs in r2 after latestOffset; using the journal r2 returns a list B ={55, 40, 70, 90} of blob ids; and r1 filters out the missing blobs (blob 55 and 90).</s><s coords="7,199.66,475.17,92.74,7.86;7,53.80,484.79,239.11,7.86;7,53.80,494.42,239.11,7.86;7,53.80,504.04,13.87,7.86">In the second phase, r1 receives the missing blobs, appends the blobs to the end of the replica, and updates the journal, indexing and latestOffset.</s></p><p><s coords="7,62.77,513.67,230.14,7.86;7,53.80,523.29,239.11,7.86;7,53.80,532.91,56.80,7.86">To provide improved efficiency and scalability of the system, the replication algorithm employs a 
number of further optimizations:</s></p><p><s coords="7,67.12,548.78,225.78,7.86;7,76.21,558.40,161.99,7.86">• Using separate thread pools for inter-and intra-datacenter replication with different periods.</s></p><p><s coords="7,67.12,573.32,225.78,7.86;7,76.21,582.94,219.77,7.86;7,76.21,592.56,48.40,7.86">• Batching requests between common partition replicas of two Datanodes, and batching blobs transferred across datacenters.</s></p><p><s coords="7,67.12,607.48,225.78,7.86;7,76.21,617.10,196.81,7.86">• Prioritizing lagging replicas to catch up at a faster rate (by using dedicated threads for lagging replicas).</s></p></div>
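A sketch of one iteration of the pull-based, two-phase protocol described above follows, from the point of view of a local replica syncing with one remote replica: the remote journal answers the phase-one query for blob ids written after the last synchronization point, and only the locally missing blobs are transferred and appended in phase two. The interfaces and method names are hypothetical, not Ambry's replication classes.

```java
import java.util.ArrayList;
import java.util.List;
import java.util.TreeMap;

/** Sketch of one replication iteration between a local replica and one remote replica. */
public class ReplicaSync {
  /** Journal: an in-memory cache of recently written blobs, ordered by their offset in the replica.
   *  The remote side uses its journal to answer the phase-one query cheaply. */
  static final class Journal {
    final TreeMap<Long, String> byOffset = new TreeMap<>();
    void record(long offset, String blobId) { byOffset.put(offset, blobId); }
    /** Blob ids written strictly after the given offset. */
    List<String> blobsAfter(long offset) {
      return new ArrayList<>(byOffset.tailMap(offset, false).values());
    }
  }

  /** Hypothetical view of a remote replica reachable over the network. */
  interface RemoteReplica {
    List<String> blobIdsAfter(long latestSyncedOffset);   // phase 1
    byte[] fetchBlob(String blobId);                       // phase 2
    long highestUsedOffset();
  }

  /** Hypothetical local replica state. */
  interface LocalReplica {
    boolean contains(String blobId);
    void append(String blobId, byte[] data);               // also updates index and journal
  }

  private long latestSyncedOffset = 0;

  public void syncOnce(LocalReplica local, RemoteReplica remote) {
    // Phase 1: ask for blobs written at the remote since the last synchronization point,
    // then keep only the ones missing locally.
    List<String> candidates = remote.blobIdsAfter(latestSyncedOffset);
    List<String> missing = new ArrayList<>();
    for (String id : candidates) {
      if (!local.contains(id)) missing.add(id);
    }
    // Phase 2: transfer only the missing blobs and append them to the local replica.
    for (String id : missing) {
      local.append(id, remote.fetchBlob(id));
    }
    latestSyncedOffset = remote.highestUsedOffset();
  }
}
```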
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6." coords="7,53.80,638.42,173.85,10.75">EXPERIMENTAL RESULTS</head><p><s coords="7,62.76,651.82,230.13,7.86;7,53.80,661.45,239.11,7.86;7,53.80,671.07,83.37,7.86">We perform three kinds of experiments: small cluster (Section 6.1), production cluster (Section 6.2), and simulations (Section 6.3).</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.1" coords="7,53.80,688.16,155.56,10.75">Throughput and Latency</head><p><s coords="7,62.76,701.56,230.14,7.86;7,53.80,711.19,239.10,7.86;7,316.81,57.64,89.55,7.86">In this section we measure the latency and throughput of the system using a micro-benchmark that stress-tests the system (Section 6.1.1),</s><s coords="7,408.69,57.64,147.23,7.86;7,316.81,67.26,66.08,7.86">under read-only, write-only and readwrite workloads.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.1.1" coords="7,319.55,86.38,112.56,9.44">Micro-Benchmark</head><p><s coords="7,325.78,98.85,162.42,7.86">We first measure the peak throughput.</s><s coords="7,495.75,98.85,60.17,7.86;7,316.81,108.48,239.11,7.86;7,316.81,118.10,239.11,7.86;7,316.81,127.72,125.45,7.86">We designed a micro-benchmark that linearly adds load to the system (by adding more clients), until the saturation point where no more requests can be handled.</s><s coords="7,448.66,127.72,107.26,7.86;7,316.81,137.35,239.11,7.86;7,316.81,146.97,28.99,7.86">Each client sends requests one at a time with the next request sent right after a response.</s></p><p><s coords="7,325.78,156.60,230.14,7.86;7,316.81,166.22,25.10,7.86">This benchmark has three modes: Write, Read, and Read-Write.</s><s coords="7,345.74,166.22,210.19,7.86;7,316.81,175.84,108.00,7.86">In Write mode, random byte array blobs are put with varying number of clients.</s><s coords="7,432.22,175.84,123.70,7.86;7,316.81,185.47,211.25,7.86">In Read mode, first blobs are written at saturation rate for a write-period of time.</s><s coords="7,532.39,185.47,23.54,7.86;7,316.81,195.09,219.95,7.86;7,536.75,193.32,3.65,5.24;7,540.91,195.09,2.56,7.86">Then, randomly chosen blobs are read from the written blobs <ref type="foot" coords="7,536.75,193.32,3.65,5.24" target="#foot_2">4</ref> .</s><s coords="7,547.49,195.09,8.44,7.86;7,316.81,204.72,239.11,7.86;7,316.81,214.34,239.11,7.86;7,316.81,223.96,24.68,7.86">In most experiments we set the write-period long enough that most read requests (&gt;80%) are served by disk, rather than RAM.</s><s coords="7,345.88,223.96,210.04,7.86;7,316.81,233.59,178.43,7.86">Read-Write is a similar to Read, but serving 50% reads and 50% writes after the write-period.</s></p><p><s coords="7,325.78,243.21,230.14,7.86;7,316.81,252.83,239.11,7.86;7,316.81,262.46,37.38,7.86">Since latency and throughput are highly correlated with blob size, we use fixed-size blobs in each run, but vary the blob size.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.1.2" coords="7,319.55,281.58,119.01,9.44">Experimental Setup</head><p><s coords="7,325.78,294.05,179.43,7.86">We deployed Ambry with a single Datanode.</s><s coords="7,509.21,294.05,46.70,7.86;7,316.81,303.67,239.11,7.86;7,316.81,313.30,239.11,7.86">The Datanode was running on a 24 core CPU with 64 GB of RAM, 14 1TB HDD disks, and a full-duplex 1 Gb/s Ethernet network.</s><s coords="7,316.81,322.92,239.11,7.86;7,316.81,332.55,204.97,7.86">4 GB of the RAM was set aside for the Datanode's internal use and the rest was left to be used as Linux Cache.</s><s coords="7,525.70,332.55,30.23,7.86;7,316.81,342.17,239.11,7.86;7,316.81,351.79,88.28,7.86">We created 8 single-replica 100 GB partitions on each disk, with a total of 122 partitions.</s><s coords="7,408.89,351.79,147.03,7.86;7,316.81,361.42,104.77,7.86">Using 14 disks with a 1 Gb/s network might look like an overkill.</s><s coords="7,425.43,361.42,130.49,7.86;7,316.81,371.04,147.25,7.86">However, disk seeks are the dominant latency factor for small blobs.</s><s coords="7,470.22,371.04,85.71,7.86;7,316.81,380.67,239.11,7.86;7,316.81,390.29,239.11,7.86">Since a large portion of blobs are small (&lt; 50 KB), we need the parallelism created by using multiple disks (more detail in Section 6.1.4).</s><s coords="7,316.81,399.91,239.11,7.86;7,316.81,409.54,72.21,7.86">Note that Ambry is designed as a cost-effective system using cheap HDD disks.</s></p><p><s coords="7,325.78,419.16,230.14,7.86;7,316.81,428.78,140.20,7.86">Clients send requests from a few machines located in the same datacenter as the Datanode.</s><s coords="7,463.47,428.78,92.44,7.86;7,316.81,438.41,239.11,7.86;7,316.81,448.03,16.62,7.86">These clients, that are acting as Frontends, directly send requests to the Datanode.</s><s coords="7,338.79,448.03,217.13,7.86;7,316.81,457.66,239.11,7.86;7,316.81,467.28,77.87,7.86">The micro-benchmark discussed above was used with varying blob sizes {25 KB, 50 KB, 100 KB, 250 KB, 500 KB, 1 MB, 5 MB}.</s><s coords="7,399.17,467.28,156.74,7.86;7,316.81,476.90,126.57,7.86">We did not go above 5 MB since blobs are chunked beyond that point.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.1.3" coords="7,319.55,496.02,152.62,9.44">Effect of Number of Clients</head><p><s coords="7,325.78,508.50,231.79,7.86;7,316.81,518.12,167.18,7.86">We ran the micro-benchmark with varying blob sizes, while linearly increasing the number of clients.</s><s coords="7,490.11,518.12,65.81,7.86;7,316.81,527.74,239.10,7.86;7,316.81,537.37,51.48,7.86">For Read mode, the write-period was set such that 6 times the RAM size, was first written.</s><s coords="7,374.65,537.37,181.27,7.86;7,316.81,546.99,114.79,7.86">Figure <ref type="figure" coords="7,404.44,537.37,9.21,7.86" target="#fig_10">8a</ref> shows the throughput in terms of MB/s served by the system.</s><s coords="7,436.91,546.99,119.01,7.86;7,316.81,556.62,205.49,7.86">Adding clients proportionally increases the throughput until the saturation point.</s><s coords="7,526.25,556.62,29.67,7.86;7,316.81,566.24,200.00,7.86">Saturation occurs at 75%-88% of the network bandwidth.</s><s coords="7,520.63,566.24,35.29,7.86;7,316.81,575.86,239.11,7.86;7,316.81,585.49,113.76,7.86">The only exception is reading small blobs due to frequent disk seeks (discussed in Section 6.1.4).</s><s coords="7,436.83,585.49,119.09,7.86;7,316.81,595.11,239.11,7.86;7,316.81,604.74,108.01,7.86">Saturation is reached quickly (usually ≤ 6 clients) since clients send requests as fast as possible in the benchmark.</s></p><p><s coords="7,325.78,614.36,230.15,7.86;7,316.81,623.98,158.85,7.86">Figure <ref type="figure" coords="7,354.64,614.36,9.72,7.86" target="#fig_10">8b</ref> shows the latency normalized by blob size (i.e., average latency divided by blob size).</s><s coords="7,484.35,623.98,71.57,7.86;7,316.81,633.61,239.12,7.86;7,316.81,643.23,239.10,7.86">Latency stays almost constant before reaching saturation point, and then increases linearly beyond the throughput saturation point.</s><s coords="7,316.81,652.85,239.10,7.86;7,316.81,662.48,239.11,7.86;7,316.81,672.10,31.01,7.86">The linear increase in latency after saturation indicates the system does not add additional overhead beyond request serving.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.1.4" coords="8,56.54,496.61,113.35,9.44">Effect of Blob Size</head><p><s coords="8,62.77,509.08,230.14,7.86;8,53.80,518.71,239.11,7.86;8,53.80,528.33,23.05,7.86">In Figures <ref type="figure" coords="8,106.44,509.08,9.21,7.86">9a</ref> and<ref type="figure" coords="8,136.11,509.08,9.72,7.86">9b</ref> we analyzed the maximum throughput (with 20 clients) under different blob sizes and workloads.</s><s coords="8,80.82,528.33,212.09,7.86;8,53.80,537.96,239.11,7.86;8,53.80,547.58,148.76,7.86">For large objects (&gt;200 KB), the maximum throughput (in MB/s) stays constant and close to maximum network bandwidth across all blob sizes.</s><s coords="8,206.58,547.58,86.32,7.86;8,53.80,557.20,175.53,7.86">Similarly, throughput in terms of requests/s scales proportionally.</s></p><p><s coords="8,62.77,566.83,230.14,7.86;8,53.80,576.45,195.87,7.86">However, for Read and Read-Write, the read throughput in terms of MB/s drops linearly for smaller sizes.</s><s coords="8,253.68,576.45,39.23,7.86;8,53.80,586.08,239.11,7.86;8,53.80,595.70,110.98,7.86">This drop is because our micro-benchmark reads blobs randomly, incurring frequent disk seeks.</s><s coords="8,169.90,595.70,123.01,7.86;8,53.80,605.32,97.08,7.86">The effect of disk seeks is amplified for smaller blobs.</s><s coords="8,155.01,605.32,137.90,7.86;8,53.80,614.95,239.11,7.86;8,53.80,624.57,239.11,7.86;8,53.80,634.19,128.56,7.86">By further profiling the disk using Bonnie++ <ref type="bibr" coords="8,100.46,614.95,9.72,7.86" target="#b0">[1]</ref> (an IO benchmark for measuring disk performance), we confirmed that disk seeks are the dominant source of latency for small blobs.</s><s coords="8,186.18,634.19,106.73,7.86;8,53.80,643.82,239.10,7.86;8,53.80,653.44,19.96,7.86">For example, when reading a 50 KB blob, more than 94% of latency is due to disk seek (6.49</s><s coords="8,76.83,653.44,206.01,7.86">ms for disk seek, and 0.4 ms for reading the data).</s></p><p><s coords="8,62.77,663.07,230.14,7.86;8,53.80,672.69,215.38,7.86">Read and Write workload mostly utilize only the outbound and inbound link on the Datanode, respectively.</s><s coords="8,272.93,672.69,20.98,7.86;8,53.80,682.31,239.10,7.86;8,53.80,691.94,179.94,7.86">However, Read-Write workload has a better mix and evenly utilizes both links reaching higher throughput.</s><s coords="8,240.02,691.94,52.89,7.86;8,53.80,701.56,239.10,7.86;8,53.80,711.19,239.11,7.86;8,316.81,162.73,239.11,7.86;8,316.81,172.36,40.90,7.86">Therefore, in our full-duplex network infrastructure the Read-Write mode is able to saturate both links reaching almost double the network bandwidth ( 1.7 Gb/s in total out of the 2 Gb/s available).</s><s coords="8,361.62,172.36,194.30,7.86;8,316.81,181.98,239.11,7.86;8,316.81,191.61,100.74,7.86">For smaller size objects it reaches twice the Read throughput, since Read-Write is a 50-50 workload with reads being the limiting factor.</s><s coords="8,325.78,201.23,230.14,7.86;8,316.81,210.85,56.90,7.86">Figure <ref type="figure" coords="8,355.51,201.23,8.69,7.86">9c</ref> demonstrates the trend in latency under various blob sizes.</s><s coords="8,377.70,210.85,178.22,7.86;8,316.81,220.48,55.59,7.86">These results are before the saturation point with 2 clients.</s><s coords="8,376.33,220.48,179.59,7.86;8,316.81,230.10,123.85,7.86">Similar to throughput, latency grows linearly except for reading small blobs.</s><s coords="8,445.10,230.10,110.82,7.86;8,316.81,239.73,239.10,7.86;8,316.81,249.35,170.24,7.86">The higher latency in Read 
is because most read requests incur a disk seek, while write requests are batched and written together.</s><s coords="8,491.05,249.35,64.87,7.86;8,316.81,258.97,239.10,7.86;8,316.81,268.60,114.16,7.86">The Read-Write latency falls halfway between Read and Write latency because of its mixed workload.</s></p></div>
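The 94% figure quoted above for 50 KB reads follows directly from the two measured components, assuming the read latency is essentially the sum of the seek and the data transfer:

```latex
\frac{t_{\text{seek}}}{t_{\text{seek}} + t_{\text{read}}}
  = \frac{6.49\ \text{ms}}{6.49\ \text{ms} + 0.4\ \text{ms}}
  \approx 0.94
```

that is, more than 94% of the latency of a 50 KB read is attributable to the disk seek.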
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.1.5" coords="8,319.55,284.50,120.54,9.44">Variance in Latency</head><p><s coords="8,325.78,296.98,230.14,7.86">The tail and variance in request latencies are important.</s><s coords="8,316.81,306.60,239.10,7.86;8,316.81,316.23,137.03,7.86">Figure <ref type="figure" coords="8,345.88,306.60,9.21,7.86" target="#fig_11">10</ref> shows the CDFs of Write, Read, and Read-Write mode experiments with 2 clients.</s><s coords="8,461.07,316.23,94.86,7.86;8,316.81,325.85,239.11,7.86;8,316.81,335.47,213.93,7.86">The CDF of Read and Write mode is very close to a vertical line, with a short tail and a majority of values close to the median.</s><s coords="8,540.05,335.47,15.87,7.86;8,316.81,345.10,239.10,7.86;8,316.81,354.72,239.11,7.86;8,316.81,364.35,239.11,7.86;8,316.81,373.97,61.42,7.86">The jump around 0.15 in Read mode (for 50 KB blob size) is because a small fraction of requests are served using the Linux cache which is orders of magnitudes faster than disk (Section 6.1.6).</s><s coords="8,384.08,373.97,171.84,7.86;8,316.81,383.59,239.11,7.86;8,316.81,393.22,177.53,7.86">The Read-Write mode is a mixture of the Read and Write CDF with a change around 0.5, following the 50% read -50% write workload pattern.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.1.6" coords="8,319.55,409.12,127.79,9.44">Effect of Linux Cache</head><p><s coords="8,325.78,421.60,230.14,7.86;8,316.81,431.22,239.11,7.86;8,316.81,440.85,239.11,7.86;8,316.81,450.47,239.11,7.86;8,316.81,460.09,182.40,7.86">We ran the micro-benchmark on 50 KB blobs and 2 clients in two configurations: 1) writing 6 times more than the RAM size before reading, so that most requests (83 %) are served by disk (Disk Read), and 2) writing data equal to the RAM size to keep all data in RAM (Cached Read).</s><s coords="8,503.42,460.09,52.50,7.86;8,316.81,469.72,93.01,7.86">Table <ref type="table" coords="8,528.78,460.09,4.61,7.86" target="#tab_5">3</ref> compares these two modes.</s></p><p><s coords="8,325.78,479.34,230.15,7.86;8,316.81,488.97,239.11,7.86;8,316.81,498.59,239.11,7.86">The Cached Read experiment performed more than 2100 requests/s (104 MB/s reaching 79% network bandwidth) matching the maximum write throughput (Section 6.1.3),</s><s coords="8,316.81,508.21,178.86,7.86">compared to 540 requests/s for Disk Reads.</s><s coords="8,501.20,508.21,54.72,7.86;8,316.81,517.84,239.10,7.86;8,316.81,527.46,140.26,7.86">We also measured the average, maximum, minimum, and standard deviation of latency, shown in Table <ref type="table" coords="8,449.92,527.46,3.58,7.86" target="#tab_5">3</ref>.</s><s coords="8,461.83,527.46,94.10,7.86;8,316.81,537.09,197.21,7.86">In both cases, the minimum is equal to reading from the Linux Cache.</s><s coords="8,519.30,537.09,36.62,7.86;8,316.81,546.71,239.11,7.86;8,316.81,556.33,117.43,7.86">However, the Cached Read case improves the average and max latency by 5.5x and 13x, respectively.</s><s coords="8,438.19,556.33,117.72,7.86;8,316.81,565.96,172.94,7.86">This shows the significance of exploiting the Linux Cache (Section 4.3.2).</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.2" coords="8,316.81,582.79,183.08,10.75">Geo-distributed Optimizations</head><p><s coords="8,325.78,596.19,230.14,7.86;8,316.81,605.81,239.11,7.86;8,316.81,615.44,57.43,7.86">We analyzed our replication algorithm among 3 different datacenters at LinkedIn {DC1, DC2, DC3}, located all across the US.</s><s coords="8,377.20,615.44,178.72,7.86;8,316.81,625.06,75.74,7.86">All experiments in this section are from production workloads.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.2.1" coords="8,319.55,640.97,103.09,9.44">Replication Lag</head><p><s coords="8,325.78,653.44,230.14,7.86;8,316.81,663.07,239.11,7.86;8,316.81,672.69,106.27,7.86">We define replication lag between a pair of replicas (r1, r2) as the difference of r2's highest used offset and r1's latest synced offset with r2.</s><s coords="8,431.49,672.69,124.43,7.86;8,316.81,682.31,239.11,7.86;8,316.81,691.94,211.56,7.86">Note that not all data in the replication lag needs to be transfered to r1, since it could receive the missing blobs from other replicas as well.</s></p><p><s coords="8,325.78,701.56,230.15,7.86;8,316.81,711.19,239.11,7.86;9,53.80,525.44,90.14,7.86;9,175.62,525.44,117.28,7.86;9,53.80,535.06,195.37,7.86">We measured the replication lag among all replicas of a given Datanode and the rest of the cluster, and found that more than 85% of the were 0. Figure <ref type="figure" coords="9,240.20,525.44,9.21,7.86" target="#fig_12">11</ref> shows the CDF of non-zero values grouped by datacenter.</s><s coords="9,255.37,535.06,37.54,7.86;9,53.80,544.69,239.10,7.86;9,53.80,554.31,239.11,7.86;9,53.80,563.93,141.85,7.86">The 95th percentile is less than 1 KB for 100 GB partitions (in all datacenters), with slightly worse lag in datacenter 3 since it is placed farther apart from others.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.2.2" coords="9,56.54,583.23,133.64,9.44">Replication Bandwidth</head><p><s coords="9,62.77,595.70,230.15,7.86;9,53.80,605.32,71.44,7.86">Ambry relies on background replication to write data to other datacenters.</s><s coords="9,128.98,605.32,167.12,7.86;9,53.80,614.95,239.10,7.86;9,53.80,624.57,115.55,7.86">We measured the aggregate network bandwidth used for inter-datacenter replication during a 24 hour period, shown in Figure <ref type="figure" coords="9,157.59,624.57,7.84,7.86" target="#fig_0">12</ref>.</s><s coords="9,176.89,624.57,116.02,7.86;9,53.80,634.19,239.11,7.86;9,53.80,643.82,185.31,7.86">The aggregate bandwidth is small (&lt; 10 MB/s), similar across all datacenters, and correlated to request rates with a diurnal pattern.</s><s coords="9,243.02,643.82,49.88,7.86;9,53.80,653.44,239.10,7.86;9,53.80,663.07,203.58,7.86">This value is small because we batch replication between common replicas and due to the read-heavy nature of the workload.</s></p><p><s coords="9,62.77,672.69,230.14,7.86;9,53.80,682.31,242.78,7.86;9,53.80,691.94,45.02,7.86">Figure <ref type="figure" coords="9,92.93,672.69,9.21,7.86">13</ref> demonstrates the CDF of average replication bandwidth per Datanode, for both intra-and inter-datacenter replication.</s><s coords="9,104.04,691.94,188.87,7.86;9,53.80,701.56,239.11,7.86;9,53.80,711.19,239.11,7.86">Intra-datacenter bandwidth is minimal (&lt; 200 B/s at 95th percentile), especially compared to inter-datacenter with 150-200 KB/s at 95th percentile (1000x larger).</s><s coords="9,316.81,506.35,239.11,7.86;9,316.81,515.98,65.93,7.86">The higher value for inter-datacenter is because of asynchronous writes.</s><s coords="9,388.54,515.98,167.38,7.86;9,316.81,525.60,169.61,7.86">However, the inter-datacenter bandwidth is still negligible (≈ 0.2% of a 1 Gb/s link).</s><s coords="9,490.31,525.60,65.61,7.86;9,316.81,535.22,239.11,7.86;9,316.81,544.85,73.00,7.86">The small difference among the 3 datacenters is due to the different request rates they receive.</s><s coords="9,325.78,554.47,231.18,7.86;9,316.81,564.10,45.49,7.86">Figure <ref type="figure" coords="9,353.80,554.47,9.21,7.86" target="#fig_14">14</ref> shows a zoomed in graph of only inter-datacenter bandwidth.</s><s coords="9,366.09,564.10,189.84,7.86;9,316.81,573.72,188.05,7.86">Inter-datacenter replication has a short tail with the 95th to 5th percentile ratio of about 3x.</s><s coords="9,513.01,573.72,42.91,7.86;9,316.81,583.34,212.03,7.86">This short tail is because of the load-balanced design in Ambry.</s><s coords="9,532.89,583.34,23.03,7.86;9,316.81,592.97,239.12,7.86;9,316.81,602.59,135.72,7.86">Intradatacenter replication has a longer tail, as well as many zero values (omitted from the graph).</s><s coords="9,459.39,602.59,96.52,7.86;9,316.81,612.22,239.11,7.86;9,316.81,621.84,145.76,7.86">Thus, replication either uses almost zero bandwidth (intra-datacenter) or almost balanced bandwidth (inter-datacenter).</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.2.3" coords="9,319.55,640.97,120.84,9.44">Replication Latency</head><p><s coords="9,325.78,653.44,230.14,7.86;9,316.81,663.07,238.60,8.35;9,316.81,672.69,104.11,8.35">We define replication latency as the time spent in one iteration of the replication protocol, i.e., T missing blobs received minus T replication initiated .</s><s coords="9,424.82,672.69,131.10,7.86;9,316.81,682.31,239.11,7.86;9,316.81,691.94,112.79,7.86">Figure <ref type="figure" coords="9,453.33,672.69,9.21,7.86" target="#fig_15">15</ref> demonstrates the CDF of average replication latency for intra-and inter-datacenter, in our production workload.</s></p><p><s coords="9,325.78,701.56,230.14,7.86;9,316.81,711.19,166.59,7.86">Inter-datacenter replication latency is low with a median of less than 150 ms, and a very short tail.</s><s coords="9,487.42,711.19,68.50,7.86;10,53.80,464.18,239.11,7.86;10,53.80,473.81,119.21,7.86">Although this la- tency might appear to be high, the number of proxy requests remain near-zero (&lt; 0.001%).</s><s coords="10,177.08,473.81,115.82,7.86;10,53.80,483.43,239.11,7.86;10,53.80,493.05,66.14,7.86">This is because users usually read data from the same local datacenter to which they have recently written.</s><s coords="10,123.89,493.05,169.01,7.86;10,53.80,502.68,76.82,7.86">Therefore, replication has a minimal effect on user experience.</s><s coords="10,62.77,512.30,230.14,7.86;10,53.80,521.92,239.11,7.86;10,53.80,531.55,35.56,7.86">Surprisingly, intra-datacenter replication latency is relatively high (6x more than inter-datacenter) and with little variance.</s><s coords="10,93.59,531.55,199.31,7.86;10,53.80,541.17,239.11,7.86;10,53.80,550.80,140.73,7.86">This pattern is because of a pre-existing and prefixed artificial added delay of 1 second, intended to prevent incorrect blob collision detections.</s><s coords="10,202.18,550.80,90.72,7.86;10,53.80,560.42,239.11,7.86;10,53.80,570.04,239.11,7.86;10,53.80,579.67,239.12,7.86;10,53.80,589.29,63.54,7.86">If a blob is replicated in a datacenter faster than the Datanode receives the initial put request (which is possible with less strict policies), the Datanode might consider the put as a blob collision and fail the request.</s><s coords="10,122.26,589.29,170.64,7.86;10,53.80,598.92,148.17,7.86">The artificial delay is used to prevent this issue in intra-datacenter replication.</s><s coords="10,208.83,598.92,84.07,7.86;10,53.80,608.54,239.11,7.86;10,53.80,618.16,239.11,7.86;10,53.80,627.79,175.48,7.86">This relatively small delay does not have a significant impact on durability or availability of data since intra-replication is only used to fix failed/slow Datanodes, which rarely occurs.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.3" coords="10,53.80,649.66,107.62,10.75">Load Balancing</head><p><s coords="10,62.77,663.07,231.18,7.86;10,53.80,672.69,239.10,7.86;10,53.80,682.31,239.11,7.86;10,53.80,691.94,149.50,7.86">Since cluster growth occurs infrequently (every few months at most), we implemented a simulator to show the behavior of Ambry over a long period of time (several years), and at large scale (hundreds of Datanodes).</s><s coords="10,209.20,691.94,83.71,7.86;10,53.80,701.56,160.60,7.86">We tried a workload that is based on production workloads.</s><s coords="10,220.71,701.56,72.20,7.86;10,53.80,711.19,164.10,7.86">All results in this section are gathered using the simulator.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.3.1" coords="10,319.55,465.56,109.90,9.44">Simulator Design</head><p><s coords="10,325.78,478.04,232.11,7.86;10,316.81,487.66,160.46,7.86">The simulator's design resembles Ambry's architecture and requests are served using the same path.</s><s coords="10,481.21,487.66,74.71,7.86;10,316.81,497.29,86.54,7.86">However, there are no real physical disks.</s><s coords="10,407.23,497.29,148.69,7.86;10,316.81,506.91,239.10,7.86;10,316.81,516.53,192.64,7.86">For handling requests, only the metadata (blob id and size) is stored/retrieved, and the effect of requests are reflected (e.g., disk space increase).</s><s coords="10,316.81,526.13,239.11,7.89;10,316.81,535.78,136.50,7.86">Workload: We use a synthetic workload closely resembling the real-world traffic at LinkedIn.</s><s coords="10,458.37,535.78,97.55,7.86;10,316.81,545.40,239.11,7.86;10,316.81,555.03,239.11,7.86;10,316.81,564.65,61.40,7.86">This workload preserves the rate of each type of request (read, write, and delete), the blob size distribution, and the access pattern of blobs (based on age).</s><s coords="10,316.81,574.25,239.10,7.89;10,316.81,583.90,239.11,7.86">Cluster Expansion: The simulator starts with an initial set of Datanodes, disks, and partitions in one datacenter.</s><s coords="10,316.81,593.52,239.11,7.86;10,316.81,603.15,239.11,7.86;10,316.81,612.77,133.99,7.86">Over time, when partitions reach the capacity threshold, a new batch of partitions are added using the replica placement strategy from Section 2.2.</s><s coords="10,460.00,612.77,95.91,7.86;10,316.81,622.40,239.12,7.86;10,316.81,632.02,179.01,7.86">If partitions cannot be added (e.g., if there is not enough unallocated space on disks), a batch of new Datanodes are added.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.3.2" coords="10,319.55,650.59,110.49,9.44">Experiment Setup</head><p><s coords="10,325.78,663.07,230.15,7.86;10,316.81,672.69,45.92,7.86">The simulation is run in a single datacenter, with 10 Frontend nodes.</s><s coords="10,366.68,672.69,189.24,7.86;10,316.81,682.31,239.11,7.86;10,316.81,691.94,14.88,7.86">The experiment starts with 15 Datanodes, each with 10 4TB disks, and 1200 100GB partitions with 3 replicas.</s><s coords="10,335.58,691.94,220.35,7.86;10,316.81,701.56,239.11,7.86">At each partition and Datanode addition point, a batch of 600 partitions and 15 Datanodes are added, respectively.</s><s coords="10,316.81,711.19,239.12,7.86;11,53.80,449.77,86.18,7.86">The simulation is run over 400 weeks (almost 8 years) and up to 240 Datanodes.</s><s coords="11,144.02,449.77,148.89,7.86;11,53.80,459.39,239.11,7.86;11,53.80,469.01,230.45,7.86">The simulation is run with and without rebalancing with the exact same configuration, while measuring request rates, disk usage, and data movement.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.3.3" coords="11,56.54,486.20,90.40,9.44">Request Rate</head><p><s coords="11,62.77,498.67,230.14,7.86;11,53.80,508.30,53.73,7.86">We measured the read rate (KB/s) for each disk at any point of time.</s><s coords="11,111.41,508.30,181.50,7.86;11,53.80,517.92,239.10,7.86;11,53.80,527.54,171.94,7.86">Figure <ref type="figure" coords="11,139.83,508.30,9.21,7.86" target="#fig_17">16</ref> demonstrates the average, standard deviation, maximum and minimum among these values, for the system with and without rebalancing.</s><s coords="11,232.14,527.54,60.76,7.86;11,53.80,537.17,234.26,7.86">The results for write rates were similar and removed due to lack of space.</s></p><p><s coords="11,62.77,546.79,230.15,7.86;11,53.80,556.42,35.53,7.86">The average, which is also the ideal, is a dropping step function.</s><s coords="11,94.30,556.42,198.60,7.86;11,53.80,566.04,84.27,7.86">The drops are points where new Datanodes were added to the cluster.</s><s coords="11,143.01,566.04,149.89,7.86;11,53.80,575.66,239.11,7.86;11,53.80,585.29,196.39,7.86">In case of no rebalancing, a majority of the disks are old read-only disks with almost zero traffic, while a few disks receive most of the request.</s><s coords="11,254.26,585.29,38.65,7.86;11,53.80,594.91,107.05,7.86">Thus, the minimum is close to zero.</s><s coords="11,168.23,594.91,124.67,7.86;11,53.80,604.54,239.11,7.86;11,53.80,614.16,105.34,7.86">Also, the maximum and standard deviation are significant (3x-7x and 1x-2x larger than the average, respectively).</s><s coords="11,164.06,614.16,128.84,7.86;11,53.80,623.78,239.11,7.86;11,53.80,633.41,160.18,7.86">When rebalancing is added, the minimum and maximum move close to the average, and the standard deviation drops close to zero.</s><s coords="11,220.63,633.41,72.28,7.86;11,53.80,643.03,142.79,7.86">We conclude that Ambry's load balancing is effective.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.3.4" coords="11,56.54,660.22,83.48,9.44">Disk Usage</head><p><s coords="11,62.77,672.69,230.15,7.86;11,53.80,682.31,239.11,7.86">We analyzed the disk usage ratio, i.e., used space divided by total space among disks, with and without rebalancing.</s><s coords="11,53.80,691.94,239.11,7.86;11,53.80,701.56,239.12,7.86;11,53.80,711.19,239.11,7.86;11,316.81,442.21,87.09,7.86">As seen in Figure <ref type="figure" coords="11,131.53,691.94,7.84,7.86" target="#fig_19">17</ref>, without rebalancing, the maximum stays at the capacity threshold since some disks become and remain full, while the minimum drops to zero whenever new Datanodes are added.</s><s coords="11,407.89,442.21,148.03,7.86;11,316.81,451.84,239.10,7.86;11,316.81,461.46,194.72,7.86">With rebalancing, the maximum and minimum move closer to the average with temporary drops in the minimum until rebalancing is completed.</s><s coords="11,517.32,461.46,38.60,7.86;11,316.81,471.08,239.11,7.86;11,316.81,480.71,239.11,7.86;11,316.81,490.33,27.14,7.86">Additionally, the standard deviation drops significantly, becoming almost zero with temporary spikes on Datanode addition points.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.3.5" coords="11,319.55,511.29,129.68,9.44">Evaluation Over Time</head><p><s coords="11,325.78,523.77,230.14,7.86;11,316.81,533.39,239.11,7.86;11,316.81,543.02,223.60,7.86">We evaluated the improvement over time by measuring the integral of range (max-min) and standard deviation for request rate and disk usage over the 400 week interval.</s><s coords="11,545.38,543.02,10.55,7.86;11,316.81,552.64,239.11,7.86;11,316.81,562.26,239.11,7.86;11,316.81,571.89,48.94,7.86">As shown in Table <ref type="table" coords="11,378.04,552.64,3.58,7.86" target="#tab_7">4</ref>, rebalancing has a prominent effect improving the range and standard deviation by 6x-9x and 8x-10x, respectively.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="6.3.6" coords="11,319.55,592.85,103.07,9.44">Data Movement</head><p><s coords="11,325.78,605.32,230.14,7.86;11,316.81,614.95,239.11,7.86;11,316.81,624.57,178.55,7.86">Whenever rebalancing is triggered, we measure the minimum data movement needed to reach an ideal state and the data movement caused by rebalancing.</s><s coords="11,503.44,624.57,52.47,7.86;11,316.81,634.19,239.11,7.86;11,316.81,643.82,239.11,7.86;11,316.81,653.44,65.22,7.86">We calculate the minimum data movement by adding the difference between ideal and current disk usage among all disks above ideal disk usage.</s><s coords="11,386.02,653.44,169.90,7.86;11,316.81,663.07,239.11,7.86;11,316.81,672.69,52.78,7.86">This value is a lower bound on the feasible minimum data movement since data is moved in granularity of partitions.</s><s coords="11,376.12,672.69,179.80,7.86;11,316.81,682.31,239.11,7.86">As shown in Figure <ref type="figure" coords="11,460.69,672.69,7.84,7.86" target="#fig_20">18</ref>, the data movement of rebalancing is very close and always below the minimum.</s><s coords="11,316.81,691.94,239.11,7.86;11,316.81,701.56,185.64,7.86">This is because the rebalancing algorithms trades off perfect balance (ideal state) for less data movement.</s><s coords="11,509.14,701.56,46.79,7.86;11,316.81,711.19,239.11,7.86;12,53.80,406.03,239.10,7.86;12,53.80,415.65,219.10,7.86">Specifically, the rebalancing algorithm usually does not remove (or add) a partition from a disk if it would go below (or above) the ideal state, even if this were to cause slight imbalance.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="7." coords="12,53.80,437.94,119.64,10.75">RELATED WORK</head><p><s coords="12,62.77,451.31,230.14,7.89;12,53.80,460.96,146.11,7.86">File Systems: The design of Ambry is inspired by logstructure file systems (LFS) <ref type="bibr" coords="12,169.99,460.96,14.32,7.86" target="#b20">[21,</ref><ref type="bibr" coords="12,185.59,460.96,10.74,7.86" target="#b22">23]</ref>.</s><s coords="12,204.47,460.96,88.43,7.86;12,53.80,470.59,239.11,7.86;12,53.80,480.21,239.10,7.86;12,53.80,489.84,23.60,7.86">These file systems are optimized for write throughput by sequentially writing in log-like data structures and relying on the OS cache for reads.</s><s coords="12,81.16,489.84,211.74,7.86;12,53.80,499.46,239.10,7.86;12,53.80,509.08,219.27,7.86">Although these single machine file systems suffer from fragmentation issues and cleaning overhead, the core ideas are very relevant, especially since blobs are immutable.</s><s coords="12,277.04,509.08,15.86,7.86;12,53.80,518.71,239.11,7.86;12,53.80,528.33,239.11,7.86;12,53.80,537.96,163.64,7.86">The main differences are the skewed data access pattern in our workload and additional optimization such as segmented indexing and Bloom filters used in Ambry.</s></p><p><s coords="12,62.77,547.58,230.14,7.86;12,53.80,557.20,63.45,7.86">There has been work on handling metadata and small files more efficiently.</s><s coords="12,122.06,557.20,170.85,7.86;12,53.80,566.83,239.10,7.86;12,53.80,576.45,239.10,7.86;12,53.80,586.08,239.11,7.86;12,53.80,595.70,95.66,7.86">Some of these techniques include reducing disk seeks <ref type="bibr" coords="12,95.75,566.83,9.20,7.86" target="#b8">[9]</ref>, using a combination of log-structured file systems (for metadata and small data) and fast file systems (for large data) <ref type="bibr" coords="12,118.34,586.08,13.49,7.86" target="#b29">[30]</ref>, and storing the initial segment of data in the index block <ref type="bibr" coords="12,132.58,595.70,13.50,7.86" target="#b16">[17]</ref>.</s><s coords="12,155.88,595.70,137.03,7.86;12,53.80,605.32,239.11,7.86;12,53.80,614.95,82.12,7.86">Our system resolves this issue by using in-memory segmented indexing plus Bloom filters and batching techniques.</s></p><p><s coords="12,62.77,624.54,230.14,7.89;12,53.80,634.19,239.11,7.86;12,53.80,643.82,239.11,7.86;12,53.80,653.44,239.11,7.86;12,53.80,663.07,101.16,7.86">Distributed File Systems: Due to the extremely large amount of data and data sharing needs, many distributed file systems such as NFS <ref type="bibr" coords="12,140.26,643.82,14.32,7.86" target="#b21">[22]</ref> and AFS <ref type="bibr" coords="12,195.98,643.82,13.49,7.86" target="#b15">[16]</ref>, and even more reliable ones handling failures, such as GFS, HDFS, and Ceph <ref type="bibr" coords="12,53.80,663.07,14.32,7.86" target="#b9">[10,</ref><ref type="bibr" coords="12,69.46,663.07,11.76,7.86" target="#b23">24,</ref><ref type="bibr" coords="12,82.56,663.07,11.76,7.86" target="#b27">28]</ref> have emerged.</s><s coords="12,159.80,663.07,133.10,7.86;12,53.80,672.69,239.11,7.86;12,53.80,682.31,239.11,7.86;12,53.80,691.94,98.74,7.86">However, all these systems suffer from the high metadata overhead and additional capabilities (e.g., nested directories, permissions, etc.) 
unnecessary for a simple blob store.</s><s coords="12,160.95,691.94,131.96,7.86;12,53.80,701.56,239.12,7.86;12,53.80,711.19,185.90,7.86">In many of these systems (e.g., HDFS, GFS, and NFS) the metadata overhead is magnified by having a separate single metadata server.</s><s coords="12,247.13,711.19,45.78,7.86;12,316.81,57.64,239.11,7.86;12,316.81,67.26,192.94,7.86">This server adds an extra hub in each request, becomes a single point of failure, and limits scalability beyond a point.</s><s coords="12,514.39,67.26,41.52,7.86;12,316.81,76.88,239.10,7.86;12,316.81,86.51,127.29,7.86">Recent research has addressed this problem by either distributing the metadata <ref type="bibr" coords="12,357.45,86.51,14.32,7.86" target="#b27">[28]</ref> or caching it <ref type="bibr" coords="12,427.22,86.51,13.50,7.86" target="#b19">[20]</ref>.</s><s coords="12,448.09,86.51,107.83,7.86;12,316.81,96.13,239.11,7.86;12,316.81,105.76,239.11,7.86;12,316.81,115.38,104.94,7.86">Although these systems alleviate accessing metadata, each small object still has a large metadata (usually stored on disk), decreasing the effective throughput of the system.</s></p><p><s coords="12,325.78,124.98,230.15,7.89;12,316.81,134.63,239.10,7.86;12,316.81,144.25,90.67,7.86">Distributed Data Stores: Many key-value stores, such as <ref type="bibr" coords="12,328.19,134.63,9.72,7.86" target="#b1">[2,</ref><ref type="bibr" coords="12,339.15,134.63,7.16,7.86" target="#b4">5,</ref><ref type="bibr" coords="12,347.55,134.63,7.16,7.86" target="#b7">8,</ref><ref type="bibr" coords="12,355.96,134.63,10.73,7.86" target="#b13">14]</ref>, have been designed to handle a large number of requests per second.</s><s coords="12,411.47,144.25,144.45,7.86;12,316.81,153.88,239.11,7.86;12,316.81,163.50,214.87,7.86">However, these systems cannot handle massively large objects (tens of MBs to GBs) efficiently, and add unnecessary overhead to provide consistency.</s><s coords="12,535.67,163.50,20.25,7.86;12,316.81,173.12,239.11,7.86;12,316.81,182.75,206.48,7.86">Also, some systems <ref type="bibr" coords="12,373.44,173.12,9.71,7.86" target="#b1">[2,</ref><ref type="bibr" coords="12,384.08,173.12,7.16,7.86" target="#b7">8,</ref><ref type="bibr" coords="12,392.15,173.12,11.76,7.86" target="#b13">14]</ref> hash data to machines, creating large data movement whenever nodes are added/deleted.</s><s coords="12,325.78,192.37,230.14,7.86;12,316.81,201.99,239.11,7.86;12,316.81,211.62,29.93,7.86">PNUTS <ref type="bibr" coords="12,361.93,192.37,9.72,7.86" target="#b5">[6]</ref> and Spanner <ref type="bibr" coords="12,432.17,192.37,9.71,7.86" target="#b6">[7]</ref> are scalable geographically distributed systems, where PNUTS maintains load balance as well.</s><s coords="12,352.22,211.62,203.70,7.86;12,316.81,221.24,239.11,7.86;12,316.81,230.87,22.07,7.86">However, both systems provide more features and stronger guarantees than needed in a simple immutable blob store.</s></p><p><s coords="12,325.78,240.46,230.14,7.89;12,316.81,250.11,147.52,7.86">Blob Stores: A similar concept to partitions in Ambry has been used in other systems.</s><s coords="12,470.05,250.11,85.86,7.86;12,316.81,259.74,239.11,7.86;12,316.81,269.36,198.42,7.86">Haystack uses logical volumes <ref type="bibr" coords="12,352.89,259.74,9.20,7.86" target="#b2">[3]</ref>, Twitter's blob store uses virtual buckets <ref type="bibr" coords="12,539.05,259.74,13.49,7.86" target="#b26">[27]</ref>, and Petal file system introduces virtual disks <ref type="bibr" coords="12,498.36,269.36,13.49,7.86" 
target="#b14">[15]</ref>.</s><s coords="12,519.18,269.36,36.74,7.86;12,316.81,278.99,239.10,7.86;12,316.81,288.61,163.66,7.86">Ambry is amenable to some optimizations in these systems such as the additional internal caching in Haystack.</s><s coords="12,487.04,288.61,68.88,7.86;12,316.81,298.23,239.11,7.86;12,316.81,307.86,42.71,7.86">However, neither Haystack nor Twitter's blob store tackle the problem of loadimbalance.</s><s coords="12,364.32,307.86,191.60,7.86;12,316.81,317.48,239.11,7.86;12,316.81,327.11,29.71,7.86">Additionally, Haystack uses synchronous writes across all replicas impacting efficiency in a geo-distributed setting.</s></p><p><s coords="12,325.78,336.73,230.14,7.86;12,316.81,346.35,239.11,7.86;12,316.81,355.98,55.31,7.86">Facebook has also designed f4 <ref type="bibr" coords="12,447.52,336.73,13.50,7.86" target="#b17">[18]</ref>, a blob store using erasure coding to reduce replication factor of old data (that has become cold).</s><s coords="12,376.15,355.98,179.77,7.86;12,316.81,365.60,239.11,7.86;12,316.81,375.22,90.90,7.86">Despite the novel ideas in this system, which potentially can be included in Ambry, our main focus is on both new and old data.</s><s coords="12,411.51,375.22,144.41,7.86;12,316.81,384.85,239.11,7.86;12,316.81,394.47,212.82,7.86">Oracle's Database <ref type="bibr" coords="12,485.48,375.22,14.32,7.86" target="#b18">[19]</ref> and Windows Azure Storage (WAS) <ref type="bibr" coords="12,405.57,384.85,9.71,7.86" target="#b3">[4]</ref> also store mutable blobs, and WAS is even optimized for a geo-distributed environment.</s><s coords="12,534.95,394.47,20.98,7.86;12,316.81,404.10,239.11,7.86;12,316.81,413.72,239.10,7.86;12,316.81,423.34,239.11,7.86;12,316.81,432.97,92.69,7.86">However, they both provide additional functionalities such as support for many data types other than blobs, strong consistency guarantees, and modification to blobs, that are not needed in our use case.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="8." coords="12,316.81,454.75,101.95,10.75">CONCLUSION</head><p><s coords="12,325.78,468.16,230.14,7.86;12,316.81,477.78,239.11,7.86;12,316.81,487.40,83.73,7.86">This paper described Ambry, a distributed storage system designed specifically for storing large immutable media objects, called blobs.</s><s coords="12,404.66,487.40,151.26,7.86;12,316.81,497.03,239.11,7.86;12,316.81,506.65,239.11,7.86;12,316.81,516.28,16.36,7.86">We designed Ambry to serve requests in a geographically distributed environment of multiple datacenters while maintaining low latency and high throughput.</s><s coords="12,338.32,516.28,217.59,7.86;12,316.81,525.90,239.11,7.86;12,316.81,535.52,239.11,7.86;12,316.81,545.15,49.47,7.86">Using a decentralized design, rebalancing mechanism, chunking, and logical blob grouping, we provide load balancing and horizontal scalability to meet the rapid growth at LinkedIn.</s></p><p><s coords="12,325.78,554.77,230.14,7.86;12,316.81,564.40,239.11,7.86;12,316.81,574.02,168.84,7.86">As part of future work we plan to adaptively change the replication factor of data based on the popularity, and use erasure coding mechanisms for cold data.</s><s coords="12,491.35,574.02,64.57,7.86;12,316.81,583.64,239.11,7.86;12,316.81,593.27,33.54,7.86">We also plan to investigate using compression mechanisms and its costs and benefits.</s><s coords="12,358.90,593.27,197.02,7.86;12,316.81,602.89,227.15,7.86">Additionally, we are working on improving the security of Ambry, especially for cross-datacenter traffic.</s></p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="9." coords="12,316.81,624.68,152.51,10.75">ACKNOWLEDGMENTS</head><p><s coords="12,325.78,638.08,230.14,7.86;12,316.81,647.70,239.10,7.86;12,316.81,657.33,239.11,7.86;12,316.81,666.95,239.10,7.86;12,316.81,676.57,239.11,7.86;12,316.81,686.20,212.42,7.86">We wish to thank the following people for their invaluable contributions towards the development and deployment of Ambry: our site reliability engineers, Tofig Suleymanov, Arjun Shenoy and Dmitry Nikiforov; our alumni Jay Wylie; our interns Tong Wei and Sarath Sreedharan; and our newest member Ming Xia for his valuable review comments.</s></p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0" coords="2,351.92,365.88,168.89,7.89"><head>Figure 2 :</head><label>2</label><figDesc><div><p><s coords="2,351.92,365.88,168.89,7.89">Figure 2: Partition and Blob layout.</s></p></div></figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_1" coords="3,336.15,213.50,200.44,7.89"><head>4 Figure 3 :</head><label>43</label><figDesc><div><p><s coords="3,336.15,213.50,200.44,7.89">Figure 3: Steps in processing an operation.</s></p></div></figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_2" coords="4,53.80,387.48,153.06,7.89;4,56.80,399.53,99.84,7.86;4,56.80,408.49,144.86,7.86;4,56.80,417.46,142.07,7.86;4,56.80,426.43,135.10,7.86;4,56.80,444.36,214.99,7.86;4,56.80,453.33,83.37,7.86;4,56.80,462.29,80.30,7.86;4,56.80,471.26,7.16,7.86;4,80.88,471.94,134.32,6.99;4,56.80,480.23,7.16,7.86;4,80.88,480.88,123.80,7.01;4,53.80,489.19,11.76,7.86;4,95.18,489.87,179.76,6.99;4,53.80,500.15,11.76,7.86;4,82.48,500.83,131.25,6.99;4,53.80,509.12,11.76,7.86;4,82.48,509.77,203.35,7.01;4,53.80,518.08,11.76,7.86;4,95.18,518.76,150.71,6.99;4,53.80,538.01,221.12,7.86;4,53.80,546.98,115.63,7.86;4,53.80,555.94,112.56,7.86;4,53.80,573.87,149.85,8.12;4,53.80,582.84,11.76,7.86;4,82.48,583.50,186.32,7.01;4,53.80,591.81,11.76,7.86;4,95.18,592.49,103.95,6.99;4,53.80,600.77,11.76,7.86;4,95.18,601.43,160.06,7.01;4,53.80,609.74,11.76,7.86;4,107.87,610.42,64.91,6.99;4,53.80,618.71,11.76,7.86;4,107.87,619.39,90.59,6.99"><head>Algorithm 1 9 :</head><label>19</label><figDesc><div><p><s coords="4,113.41,387.50,93.46,7.86;4,56.80,399.53,99.84,7.86">Rebalancing Algorithm 1: // Compute ideal state.</s><s coords="4,56.80,408.49,144.86,7.86;4,56.80,417.46,142.07,7.86;4,56.80,426.43,135.10,7.86;4,56.80,444.36,214.99,7.86">2: idealRW=totalNumRW / numDisks 3: idealRO=totalNumRO / numDisks 4: idealUsed=totalUsed / numDisks 5: // Phase1: move extra partitions into a partition pool.</s><s coords="4,56.80,453.33,83.37,7.86;4,56.80,462.29,80.30,7.86;4,56.80,471.26,7.16,7.86;4,80.88,471.94,134.32,6.99">6: partitionP ool = {} 7: for each disk d do 8: // Move extra read-write partitions.</s><s coords="4,80.88,480.88,123.80,7.01;4,53.80,489.19,11.76,7.86;4,95.18,489.87,179.76,6.99;4,53.80,500.15,11.76,7.86;4,82.48,500.83,131.25,6.99">while d.N umRW &gt; idealRW do 10: partitionP ool += chooseM inimumU sedRW (d) 11: // Move extra read-only partitions.</s><s coords="4,53.80,509.12,11.76,7.86;4,82.48,509.77,203.35,7.01;4,53.80,518.08,11.76,7.86;4,95.18,518.76,150.71,6.99;4,53.80,538.01,221.12,7.86">12: while d.N umRO &gt; idealRO &amp; d.used &gt; idealUsed do 13: partitionP ool += chooseRandomRO(d) 14: // Phase2: Move partitions to disks needing partitions.</s><s coords="4,53.80,546.98,115.63,7.86;4,53.80,555.94,112.56,7.86;4,53.80,573.87,149.85,8.12;4,53.80,582.84,11.76,7.86;4,82.48,583.50,186.32,7.01;4,53.80,591.81,11.76,7.86;4,95.18,601.43,160.06,7.01;4,53.80,609.74,11.76,7.86;4,107.87,610.42,64.91,6.99;4,53.80,618.71,11.76,7.86;4,107.87,619.39,90.59,6.99">15: placePartitions(read-write) 16: placePartitions(read-only) 17: function placePartitions(Type t) 18: while partitionP ool contains partitions type t do 19: for disk d in D and partition p in pool do 21: d.addPartition(p) 22: partitionP ool.remove(p)</s></p></div></figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_4" coords="5,316.81,90.33,239.11,7.89;5,316.81,100.79,68.54,7.89"><head>Figure 4 :</head><label>4</label><figDesc><div><p><s coords="5,316.81,90.33,239.11,7.89;5,316.81,100.79,68.54,7.89">Figure 4: Content of the metadata blob used for chunked blobs.</s></p></div></figDesc></figure>
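<note xmlns="http://www.tei-c.org/ns/1.0" type="example"><p>As a rough illustration of the metadata blob in Figure 4, the sketch below splits a large blob into chunks, stores each chunk as its own blob, and stores the chunk size plus the ordered list of chunk ids in a separate metadata blob whose id is returned to the caller. The dictionary store, the JSON encoding, and the field names are assumptions made for the example; they are not Ambry's wire format.</p>
import json
import uuid

def put_large_blob(store, data, chunk_size=4 * 1024 * 1024):
    """Store a large blob as chunks plus a metadata blob; return the metadata id."""
    chunk_ids = []
    for start in range(0, len(data), chunk_size):
        chunk_id = str(uuid.uuid4())                 # id of this chunk blob
        store[chunk_id] = data[start:start + chunk_size]
        chunk_ids.append(chunk_id)
    metadata = {"chunk_size": chunk_size,
                "num_chunks": len(chunk_ids),
                "chunks": chunk_ids}                 # ordered chunk blob ids
    metadata_id = str(uuid.uuid4())
    store[metadata_id] = json.dumps(metadata).encode()
    return metadata_id                               # id handed back to the client

def get_large_blob(store, metadata_id):
    """Reassemble the blob by following the chunk list in the metadata blob."""
    metadata = json.loads(store[metadata_id])
    return b"".join(store[cid] for cid in metadata["chunks"])
</note>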
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_5" coords="5,316.81,190.11,239.11,7.89;5,316.81,200.57,225.27,7.89"><head>Figure 5 :</head><label>5</label><figDesc><div><p><s coords="5,316.81,190.11,239.11,7.89;5,316.81,200.57,225.27,7.89">Figure 5: Failure detection algorithm with maximum tolerance of 2 consecutive failed responses.</s></p></div></figDesc></figure>
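<note xmlns="http://www.tei-c.org/ns/1.0" type="example"><p>A small sketch of the behaviour summarized in Figure 5, assuming a per-resource count of consecutive failed responses: after more than two consecutive failures the resource (Datanode, disk, or replica) is treated as unavailable, a later successful response restores it, and a time-based backoff lets a probe request through so recovery can be detected. The backoff value and probing policy are illustrative assumptions.</p>
import time

class FailureDetector:
    def __init__(self, max_consecutive_failures=2, retry_backoff_s=30.0):
        self.max_failures = max_consecutive_failures
        self.backoff = retry_backoff_s
        self.failures = {}      # resource -> consecutive failed responses
        self.down_since = {}    # resource -> time it was marked unavailable

    def on_response(self, resource, ok):
        if ok:
            # Any successful response makes the resource available again.
            self.failures[resource] = 0
            self.down_since.pop(resource, None)
        else:
            self.failures[resource] = self.failures.get(resource, 0) + 1
            if self.failures[resource] > self.max_failures:
                self.down_since[resource] = time.time()

    def is_available(self, resource):
        if resource not in self.down_since:
            return True
        # After the backoff, allow a probe request so the resource can recover.
        return time.time() - self.down_since[resource] >= self.backoff
</note>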
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_6" coords="6,316.81,254.65,239.10,7.89;6,316.81,265.11,239.11,7.89;6,316.81,275.57,200.46,7.89"><head>Figure 6 :</head><label>6</label><figDesc><div><p><s coords="6,316.81,254.65,239.10,7.89;6,316.81,265.11,33.82,7.89">Figure 6: Indexing of blob offsets in a partition replica.</s><s coords="6,358.87,265.11,197.05,7.89;6,316.81,275.57,200.46,7.89">When blobs are put (blob 60) or deleted (blob 20) the indexing structure is updated.</s></p></div></figDesc></figure>
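<note xmlns="http://www.tei-c.org/ns/1.0" type="example"><p>A toy sketch of the indexing shown in Figure 6: blobs are appended to a log and an index maps each blob id to its offset; a delete appends a delete record and flags the existing entry rather than rewriting the log. Record framing, the journal, and the segmented on-disk index are omitted, so this is an assumption-laden simplification rather than the actual store layout.</p>
class PartitionReplica:
    def __init__(self):
        self.log = bytearray()     # append-only log of blob and delete records
        self.index = {}            # blob id -> (offset in log, is_deleted)

    def put(self, blob_id, data):
        offset = len(self.log)
        self.log += data           # real records also carry headers and metadata
        self.index[blob_id] = (offset, False)
        return offset

    def delete(self, blob_id):
        old_offset, _ = self.index[blob_id]
        self.log += b"DELETE"      # stand-in for an appended delete record
        self.index[blob_id] = (old_offset, True)   # keep offset, mark deleted

    def get(self, blob_id):
        offset, deleted = self.index[blob_id]
        if deleted:
            raise KeyError("blob %s was deleted" % blob_id)
        return offset              # a real read would seek into the log here
</note>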
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_8" coords="7,53.80,179.03,239.11,7.89;7,53.80,189.49,236.52,7.89"><head>Figure 7 :</head><label>7</label><figDesc><div><p><s coords="7,53.80,179.03,239.11,7.89;7,53.80,189.49,236.52,7.89">Figure 7: Journals for two replicas of the same partition and an example of the replication algorithm.</s></p></div></figDesc></figure>
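<note xmlns="http://www.tei-c.org/ns/1.0" type="example"><p>The journal-based exchange illustrated in Figure 7 can be sketched as follows: each replica records blob ids in arrival order, a replica asks its peer for journal entries after the last synchronized point, and it then fetches whichever of those blobs it is missing. Token handling, batching, and persistence are simplified assumptions here, not the production replication protocol.</p>
class Replica:
    def __init__(self):
        self.journal = []          # blob ids in the order they were written
        self.blobs = {}            # blob id -> blob data

    def write(self, blob_id, data):
        self.blobs[blob_id] = data
        self.journal.append(blob_id)

    def entries_after(self, token):
        # token = number of journal entries the peer has already seen from us
        return self.journal[token:], len(self.journal)

def replicate(source, destination, token):
    """Pull blobs present on source but missing on destination; return new token."""
    new_ids, new_token = source.entries_after(token)
    missing = [blob_id for blob_id in new_ids if blob_id not in destination.blobs]
    for blob_id in missing:
        destination.write(blob_id, source.blobs[blob_id])
    return new_token
</note>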
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_9" coords="8,99.25,411.60,145.13,7.86"><head></head><label></label><figDesc><div><p><s coords="8,114.55,411.60,129.83,7.86">Latency normalized by blob size</s></p></div></figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_10" coords="8,53.80,435.94,239.11,7.89;8,53.80,446.40,239.10,7.89;8,53.80,456.86,239.11,7.89;8,53.80,467.32,104.89,7.89"><head>Figure 8 :</head><label>8</label><figDesc><div><p><s coords="8,53.80,435.94,239.11,7.89;8,53.80,446.40,239.10,7.89;8,53.80,456.86,47.74,7.89">Figure 8: Throughput and latency of read and write requests with varying number of clients on different blob sizes.</s><s coords="8,107.36,456.86,185.55,7.89;8,53.80,467.32,104.89,7.89">These results were gathered on a single Datanode deployment.</s></p></div></figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_11" coords="9,53.80,463.24,239.11,7.89;9,53.80,473.70,239.11,7.89;9,53.80,484.16,239.11,7.89;9,53.80,494.62,76.53,7.89"><head>Figure 10 :</head><label>10</label><figDesc><div><p><s coords="9,53.80,463.24,239.11,7.89;9,53.80,473.70,138.96,7.89">Figure 10: CDF of read and write latency with 2 clients for various blob sizes.</s><s coords="9,201.87,473.70,91.04,7.89;9,53.80,484.16,239.11,7.89;9,53.80,494.62,76.53,7.89">Read-Write falls in the middle with change around 0.5 due to the 50-50 mixed workload.</s></p></div></figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_12" coords="9,316.81,443.62,239.11,7.89;9,316.81,454.08,239.11,7.89;9,316.81,464.54,239.11,7.89;9,316.81,475.00,29.56,7.89"><head>Figure 11 :</head><label>11</label><figDesc><div><p><s coords="9,316.81,443.62,239.11,7.89;9,316.81,454.08,239.11,7.89">Figure 11: CDF of Replication Lag among replica pairs of a given Datanode and the rest of the cluster.</s><s coords="9,316.81,464.54,239.11,7.89;9,316.81,475.00,29.56,7.89">Most values were zero, and are omitted from the graph.</s></p></div></figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_13" coords="10,53.80,211.70,239.10,7.89;10,53.80,222.16,239.10,7.89;10,53.80,232.62,65.91,7.89"><head>Figure 12 :</head><label>12</label><figDesc><div><p><s coords="10,53.80,211.70,239.10,7.89;10,53.80,222.16,239.10,7.89;10,53.80,232.62,65.91,7.89">Figure 12: Aggregate network bandwidth used for inter-datacenter replication during a 24 hour period in production.</s></p></div></figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_14" coords="10,316.81,209.40,239.11,7.89;10,316.81,219.87,239.11,7.89;10,316.81,230.33,65.91,7.89"><head>Figure 14 :</head><label>14</label><figDesc><div><p><s coords="10,316.81,209.40,239.11,7.89;10,316.81,219.87,239.11,7.89;10,316.81,230.33,65.91,7.89">Figure 14: CDF of average inter-datacenter network bandwidth used per Datanode over a 24 hour period in production.</s></p></div></figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_15" coords="10,316.81,415.05,239.11,7.89;10,316.81,425.51,239.10,7.89;10,316.81,435.97,205.26,7.89"><head>Figure 15 :</head><label>15</label><figDesc><div><p><s coords="10,316.81,415.05,239.11,7.89;10,316.81,425.51,239.10,7.89;10,316.81,435.97,205.26,7.89">Figure 15: CDF of average replication latency (i.e., time spent to receive missing blobs) for intra- and inter-datacenter replication in the production environment.</s></p></div></figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_17" coords="11,53.80,386.30,239.11,7.89;11,53.80,396.76,239.11,7.89;11,53.80,407.22,239.11,7.89;11,53.80,417.68,239.11,7.89;11,53.80,428.14,39.03,7.89"><head>Figure 16 :</head><label>16</label><figDesc><div><p><s coords="11,53.80,386.30,239.11,7.89;11,53.80,396.76,239.11,7.89;11,53.80,407.22,141.86,7.89">Figure 16: Average, standard deviation, maximum and minimum of average read rate (KB/s) among disks over a 400 week interval.</s><s coords="11,200.46,407.22,92.45,7.89;11,53.80,417.68,239.11,7.89;11,53.80,428.14,39.03,7.89">The system is bootstrapping in the first few weeks, and the results are omitted.</s></p></div></figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_19" coords="11,316.81,393.65,239.11,7.89;11,316.81,404.11,239.12,7.89;11,316.81,414.57,225.29,7.89"><head>Figure 17 :</head><label>17</label><figDesc><div><p><s coords="11,316.81,393.65,239.11,7.89;11,316.81,404.11,239.12,7.89;11,316.81,414.57,225.29,7.89">Figure 17: Average, standard deviation, maximum and minimum of disk usage ratio (i.e., used space divided by total space) over a 400 week interval.</s></p></div></figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_20" coords="12,53.80,354.69,239.11,7.89;12,53.80,365.15,239.11,7.89;12,53.80,375.61,221.15,7.89"><head>Figure 18 :</head><label>18</label><figDesc><div><p><s coords="12,53.80,354.69,239.11,7.89;12,53.80,365.15,239.11,7.89;12,53.80,375.61,221.15,7.89">Figure 18: Data movement of the rebalancing algorithm at each rebalancing point (i.e., whenever new Datanodes are added) over a 400 week interval.</s></p></div></figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_1" coords="4,328.14,55.11,216.45,119.20"><head>Table 1 :</head><label>1</label><figDesc><div><p><s coords="4,369.97,166.41,174.63,7.89">Hardware layout in Cluster Manager.</s></p></div></figDesc><table coords="4,330.71,55.11,211.30,98.67"><row><cell>Datacenter</cell><cell>Datanode</cell><cell>Disk</cell><cell cols="2">Size Status</cell></row><row><cell></cell><cell></cell><cell>disk 1</cell><cell>4 TB</cell><cell>UP</cell></row><row><cell>DC 1</cell><cell>Datanode 1</cell><cell>... disk k</cell><cell>... 4 TB</cell><cell>... UP</cell></row><row><cell></cell><cell></cell><cell>disk 1</cell><cell cols="2">4 TB DOWN</cell></row><row><cell>DC 1</cell><cell>Datanode 2</cell><cell>...</cell><cell>...</cell><cell>...</cell></row><row><cell></cell><cell></cell><cell>disk k</cell><cell>4 TB</cell><cell>UP</cell></row><row><cell></cell><cell>...</cell><cell>...</cell><cell>...</cell><cell>...</cell></row><row><cell></cell><cell></cell><cell>disk 1</cell><cell cols="2">1 TB DOWN</cell></row><row><cell>DC n</cell><cell>Datanode j</cell><cell>...</cell><cell>...</cell><cell>...</cell></row><row><cell></cell><cell></cell><cell>disk k</cell><cell>1 TB</cell><cell>UP</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_3" coords="5,53.80,147.98,239.11,104.88"><head>Table 2 :</head><label>2</label><figDesc><div><p><s>Logical Layout in Cluster Manager.</s><s>An example of this layout is shown in Table 2.</s><s>As shown, replicas of a partition can be placed on multiple Datanodes in one datacenter, and/or in different datacenters.</s><s>Additionally, one disk (e.g., DC 1: Datanode 1: disk 1) can contain replicas of distinct partitions, where some are read-only and some are read-write.</s><s>Partitions are added by updating the logical layout stored in the Cluster Manager instances².</s></p></div></figDesc><table /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_5" coords="8,316.81,103.15,239.11,39.27"><head>Table 3 :</head><label>3</label><figDesc><div><p><s coords="8,361.21,103.15,194.70,7.89;8,316.81,113.61,239.11,7.89;8,316.81,124.07,239.11,7.89;8,316.81,134.53,182.87,7.89">Comparison of get request latency when most requests (83%) are served by disk (Disk Reads) and when all requests are served by the Linux cache (Cached Reads) for 50 KB blobs.</s></p></div></figDesc><table /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_7" coords="12,53.80,141.55,239.11,198.84"><head>Table 4 :</head><label>4</label><figDesc><div><p><s>Improvement of range (max-min) and standard deviation of request rates and disk usage over a 400 week interval. Results are from the system with rebalancing (w/ RB) and without rebalancing (w/o RB).</s></p></div></figDesc><table /></figure>
<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="2" xml:id="foot_0">Currently, the system administrator manually adds partitions in order to prevent unwanted and rapid cluster growth. However, this can easily be automated.</note>
<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="3" xml:id="foot_1">Chunk size is not fixed and can be adapted to follow the growth in blob sizes, improvements in network, etc.</note>
<note xmlns="http://www.tei-c.org/ns/1.0" place="foot" n="4" xml:id="foot_2">The random strategy gives a lower bound on the system's performance since real-world access patterns are skewed toward recent data.</note>
</body>
<back>
<div type="annex">
<div xmlns="http://www.tei-c.org/ns/1.0"><p><s coords="9,53.80,249.80,379.21,7.89">Results were gathered on a write-only, read-only, and mixed (50%-50%) workload.</s><s coords="9,437.57,249.80,118.36,7.89;9,53.80,260.26,462.21,7.89">Reads for small blob sizes (&lt;200 KB) are slowed down by frequent disk seeks, while other requests saturate the network link.</s></p></div> </div>
<div type="references">
<listBibl>
<biblStruct coords="13,72.59,68.67,204.44,7.86;13,72.59,78.29,108.25,7.86" xml:id="b0">
<monogr>
<title level="m" type="main">Bonnie++</title>
<ptr target="http://www.coker.com.au/bonnie++/" />
<imprint>
<date type="published" when="2001">2001. accessed Mar, 2016</date>
</imprint>
</monogr>
<note type="raw_reference">Bonnie++. http://www.coker.com.au/bonnie++/, 2001 (accessed Mar, 2016).</note>
</biblStruct>
<biblStruct coords="13,72.59,88.91,185.97,7.86;13,72.59,98.54,165.95,7.86;13,72.59,108.16,189.31,7.86;13,72.59,117.78,202.35,7.86;13,72.59,127.41,191.88,7.86" xml:id="b1">
<analytic>
<title level="a" type="main">Data Infrastructure at LinkedIn</title>
<author>
<persName><forename type="first">Aditya</forename><surname>Auradkar</surname></persName>
</author>
<author>
<persName><forename type="first">Chavdar</forename><surname>Botev</surname></persName>
</author>
<author>
<persName><forename type="first">Shirshanka</forename><surname>Das</surname></persName>
</author>
<author>
<persName><forename type="first">Dave</forename><surname>De Maagd</surname></persName>
</author>
<author>
<persName><forename type="first">Alex</forename><surname>Feinberg</surname></persName>
</author>
<author>
<persName><forename type="first">Phanindra</forename><surname>Ganti</surname></persName>
</author>
<author>
<persName><forename type="first">Lei</forename><surname>Gao</surname></persName>
</author>
<author>
<persName><forename type="first">Bhaskar</forename><surname>Ghosh</surname></persName>
</author>
<author>
<persName><forename type="first">Kishore</forename><surname>Gopalakrishna</surname></persName>
</author>
<author>
<persName><forename type="first">Brendan</forename><surname>Harris</surname></persName>
</author>
<author>
<persName><forename type="first">Joel</forename><surname>Koshy</surname></persName>
</author>
<author>
<persName><forename type="first">Kevin</forename><surname>Krawez</surname></persName>
</author>
<author>
<persName><forename type="first">Jay</forename><surname>Kreps</surname></persName>
</author>
<author>
<persName><forename type="first">Shi</forename><surname>Lu</surname></persName>
</author>
<author>
<persName><forename type="first">Sunil</forename><surname>Nagaraj</surname></persName>
</author>
<author>
<persName><forename type="first">Neha</forename><surname>Narkhede</surname></persName>
</author>
<author>
<persName><forename type="first">Sasha</forename><surname>Pachev</surname></persName>
</author>
<author>
<persName><forename type="first">Igor</forename><surname>Perisic</surname></persName>
</author>
<author>
<persName><forename type="first">Lin</forename><surname>Qiao</surname></persName>
</author>
<author>
<persName><forename type="first">Tom</forename><surname>Quiggle</surname></persName>
</author>
<author>
<persName><forename type="first">Jun</forename><surname>Rao</surname></persName>
</author>
<author>
<persName><forename type="first">Bob</forename><surname>Schulman</surname></persName>
</author>
<author>
<persName><forename type="first">Abraham</forename><surname>Sebastian</surname></persName>
</author>
<author>
<persName><forename type="first">Oliver</forename><surname>Seeliger</surname></persName>
</author>
<author>
<persName><forename type="first">Adam</forename><surname>Silberstein</surname></persName>
</author>
<author>
<persName><forename type="first">Bboris</forename><surname>Shkolnik</surname></persName>
</author>
<author>
<persName><forename type="first">Chinmay</forename><surname>Soman</surname></persName>
</author>
<author>
<persName><forename type="first">Roshan</forename><surname>Sumbaly</surname></persName>
</author>
<author>
<persName><forename type="first">Kapil</forename><surname>Surlaker</surname></persName>
</author>
<author>
<persName><forename type="first">Sajid</forename><surname>Topiwala</surname></persName>
</author>
<author>
<persName><forename type="first">Cuong</forename><surname>Tran</surname></persName>
</author>
<author>
<persName><forename type="first">Balaji</forename><surname>Varadarajan</surname></persName>
</author>
<author>
<persName><forename type="first">Jemiah</forename><surname>Westerman</surname></persName>
</author>
<author>
<persName><forename type="first">Zach</forename><surname>White</surname></persName>
</author>
<author>
<persName><forename type="first">David</forename><surname>Zhang</surname></persName>
</author>
<author>
<persName><forename type="first">Jason</forename><surname>Zhang</surname></persName>
</author>
<idno type="DOI">10.1109/icde.2012.147</idno>
</analytic>
<monogr>
<title level="m">2012 IEEE 28th International Conference on Data Engineering</title>
<imprint>
<publisher>IEEE</publisher>
<date type="published" when="2012-04">2012</date>
</imprint>
</monogr>
<note type="raw_reference">A. Auradkar, C. Botev, S. Das, D. De Maagd, A. Feinberg, P. Ganti, L. Gao, B. Ghosh, K. Gopalakrishna, et al. Data infrastructure at LinkedIn. In Proceeding of the IEEE International Conference on Data Engineering (ICDE), 2012.</note>
</biblStruct>
<biblStruct coords="13,72.59,138.03,181.33,7.86;13,72.59,147.65,209.88,7.86;13,72.59,157.28,178.82,7.86;13,72.59,166.90,184.05,7.86;13,72.59,176.52,122.88,7.86" xml:id="b2">
<analytic>
<title level="a" type="main">OSDI (9th Usenix Symposium on Operating Systems Design and Implementation) advertisement</title>
<author>
<persName coords=""><forename type="first">D</forename><surname>Beaver</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">S</forename><surname>Kumar</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">H</forename><forename type="middle">C</forename><surname>Li</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">J</forename><surname>Sobel</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">P</forename><surname>Vajgel</surname></persName>
</author>
<idno type="DOI">10.1109/msp.2010.134</idno>
</analytic>
<monogr>
<title level="j">IEEE Security &amp; Privacy Magazine</title>
<title level="j" type="abbrev">IEEE Secur. Privacy Mag.</title>
<idno type="ISSN">1540-7993</idno>
<imprint>
<biblScope unit="volume">8</biblScope>
<biblScope unit="issue">4</biblScope>
<biblScope unit="page" from="c4" to="c4" />
<date type="published" when="2010-07">2010</date>
<publisher>Institute of Electrical and Electronics Engineers (IEEE)</publisher>
</imprint>
</monogr>
<note type="raw_reference">D. Beaver, S. Kumar, H. C. Li, J. Sobel, and P. Vajgel. Finding a needle in Haystack: Facebook&apos;s photo storage. In Proceeding of the USENIX Symposium on Operating Systems Design and Implementation (OSDI), 2010.</note>
</biblStruct>
<biblStruct coords="13,72.59,187.14,180.83,7.86;13,72.59,196.77,192.89,7.86;13,72.59,206.39,190.24,7.86;13,72.59,216.01,220.32,7.86;13,72.59,225.64,208.17,7.86;13,72.59,235.26,136.47,7.86" xml:id="b3">
<analytic>
<title level="a" type="main">Windows Azure Storage</title>
<author>
<persName><forename type="first">Brad</forename><surname>Calder</surname></persName>
</author>
<author>
<persName><forename type="first">Ju</forename><surname>Wang</surname></persName>
</author>
<author>
<persName><forename type="first">Aaron</forename><surname>Ogus</surname></persName>
</author>
<author>
<persName><forename type="first">Niranjan</forename><surname>Nilakantan</surname></persName>
</author>
<author>
<persName><forename type="first">Arild</forename><surname>Skjolsvold</surname></persName>
</author>
<author>
<persName><forename type="first">Sam</forename><surname>Mckelvie</surname></persName>
</author>
<author>
<persName><forename type="first">Yikang</forename><surname>Xu</surname></persName>
</author>
<author>
<persName><forename type="first">Shashwat</forename><surname>Srivastav</surname></persName>
</author>
<author>
<persName><forename type="first">Jiesheng</forename><surname>Wu</surname></persName>
</author>
<author>
<persName><forename type="first">Huseyin</forename><surname>Simitci</surname></persName>
</author>
<author>
<persName><forename type="first">Jaidev</forename><surname>Haridas</surname></persName>
</author>
<author>
<persName><forename type="first">Chakravarthy</forename><surname>Uddaraju</surname></persName>
</author>
<author>
<persName><forename type="first">Hemal</forename><surname>Khatri</surname></persName>
</author>
<author>
<persName><forename type="first">Andrew</forename><surname>Edwards</surname></persName>
</author>
<author>
<persName><forename type="first">Vaman</forename><surname>Bedekar</surname></persName>
</author>
<author>
<persName><forename type="first">Shane</forename><surname>Mainali</surname></persName>
</author>
<author>
<persName><forename type="first">Rafay</forename><surname>Abbasi</surname></persName>
</author>
<author>
<persName><forename type="first">Arpit</forename><surname>Agarwal</surname></persName>
</author>
<author>
<persName><forename type="first">Mian</forename><forename type="middle">Fahim Ul</forename><surname>Haq</surname></persName>
</author>
<author>
<persName><forename type="first">Muhammad</forename><forename type="middle">Ikram Ul</forename><surname>Haq</surname></persName>
</author>
<author>
<persName><forename type="first">Deepali</forename><surname>Bhardwaj</surname></persName>
</author>
<author>
<persName><forename type="first">Sowmya</forename><surname>Dayanand</surname></persName>
</author>
<author>
<persName><forename type="first">Anitha</forename><surname>Adusumilli</surname></persName>
</author>
<author>
<persName><forename type="first">Marvin</forename><surname>Mcnett</surname></persName>
</author>
<author>
<persName><forename type="first">Sriram</forename><surname>Sankaran</surname></persName>
</author>
<author>
<persName><forename type="first">Kavitha</forename><surname>Manivannan</surname></persName>
</author>
<author>
<persName><forename type="first">Leonidas</forename><surname>Rigas</surname></persName>
</author>
<idno type="DOI">10.1145/2043556.2043571</idno>
</analytic>
<monogr>
<title level="m">Proceedings of the Twenty-Third ACM Symposium on Operating Systems Principles</title>
<meeting>the Twenty-Third ACM Symposium on Operating Systems Principles</meeting>
<imprint>
<publisher>ACM</publisher>
<date type="published" when="2011-10-23">2011</date>
</imprint>
</monogr>
<note type="raw_reference">B. Calder, J. Wang, A. Ogus, N. Nilakantan, A. Skjolsvold, S. McKelvie, Y. Xu, S. Srivastav, J. Wu, et al. Windows Azure storage: A highly available cloud storage service with strong consistency. In Proceeding of the ACM Symposium on Operating Systems Principles (SOSP), 2011.</note>
</biblStruct>
<biblStruct coords="13,72.59,245.88,216.95,7.86;13,72.59,255.51,220.31,7.86;13,72.59,265.13,205.30,7.86;13,72.59,274.75,200.22,7.86;13,72.59,284.38,121.32,7.86" xml:id="b4">
<analytic>
<title level="a" type="main">Bigtable</title>
<author>
<persName><forename type="first">Fay</forename><surname>Chang</surname></persName>
</author>
<author>
<persName><forename type="first">Jeffrey</forename><surname>Dean</surname></persName>
</author>
<author>
<persName><forename type="first">Sanjay</forename><surname>Ghemawat</surname></persName>
</author>
<author>
<persName><forename type="first">Wilson</forename><forename type="middle">C</forename><surname>Hsieh</surname></persName>
</author>
<author>
<persName><forename type="first">Deborah</forename><forename type="middle">A</forename><surname>Wallach</surname></persName>
</author>
<author>
<persName><forename type="first">Mike</forename><surname>Burrows</surname></persName>
</author>
<author>
<persName><forename type="first">Tushar</forename><surname>Chandra</surname></persName>
</author>
<author>
<persName><forename type="first">Andrew</forename><surname>Fikes</surname></persName>
</author>
<author>
<persName><forename type="first">Robert</forename><forename type="middle">E</forename><surname>Gruber</surname></persName>
</author>
<idno type="DOI">10.1145/1365815.1365816</idno>
</analytic>
<monogr>
<title level="j">ACM Transactions on Computer Systems</title>
<title level="j" type="abbrev">ACM Trans. Comput. Syst.</title>
<idno type="ISSN">0734-2071</idno>
<idno type="ISSNe">1557-7333</idno>
<imprint>
<biblScope unit="volume">26</biblScope>
<biblScope unit="issue">2</biblScope>
<biblScope unit="page" from="1" to="26" />
<date type="published" when="2008-06">2008</date>
<publisher>Association for Computing Machinery (ACM)</publisher>
</imprint>
</monogr>
<note type="raw_reference">F. Chang, J. Dean, S. Ghemawat, W. C. Hsieh, D. A. Wallach, M. Burrows, T. Chandra, A. Fikes, and R. E. Gruber. Bigtable: A distributed storage system for structured data. ACM Transactions on Computer Systems (TOCS), 26(2), 2008.</note>
</biblStruct>
<biblStruct coords="13,72.59,295.00,190.62,7.86;13,72.59,304.62,202.58,7.86;13,72.59,314.25,207.71,7.86;13,72.59,323.87,219.98,7.86;13,72.59,333.49,181.19,7.86" xml:id="b5">
<analytic>
<title level="a" type="main">PNUTS</title>
<author>
<persName><forename type="first">Brian</forename><forename type="middle">F</forename><surname>Cooper</surname></persName>
</author>
<author>
<persName><forename type="first">Raghu</forename><surname>Ramakrishnan</surname></persName>
</author>
<author>
<persName><forename type="first">Utkarsh</forename><surname>Srivastava</surname></persName>
</author>
<author>
<persName><forename type="first">Adam</forename><surname>Silberstein</surname></persName>
</author>
<author>
<persName><forename type="first">Philip</forename><surname>Bohannon</surname></persName>
</author>
<author>
<persName><forename type="first">Hans-Arno</forename><surname>Jacobsen</surname></persName>
</author>
<author>
<persName><forename type="first">Nick</forename><surname>Puz</surname></persName>
</author>
<author>
<persName><forename type="first">Daniel</forename><surname>Weaver</surname></persName>
</author>
<author>
<persName><forename type="first">Ramana</forename><surname>Yerneni</surname></persName>
</author>
<idno type="DOI">10.14778/1454159.1454167</idno>
</analytic>
<monogr>
<title level="j">Proceedings of the VLDB Endowment</title>
<title level="j" type="abbrev">Proc. VLDB Endow.</title>
<idno type="ISSN">2150-8097</idno>
<imprint>
<biblScope unit="volume">1</biblScope>
<biblScope unit="issue">2</biblScope>
<biblScope unit="page" from="1277" to="1288" />
<date type="published" when="2008-08">2008</date>
<publisher>Association for Computing Machinery (ACM)</publisher>
</imprint>
</monogr>
<note type="raw_reference">B. F. Cooper, R. Ramakrishnan, U. Srivastava, A. Silberstein, P. Bohannon, A. Jacobsen, N. Puz, D. Weaver, and R. Yerneni. Pnuts: Yahoo!&apos;s hosted data serving platform. In Proceeding of the Very Large Data Bases Endowment (VLDB), 1(2), 2008.</note>
</biblStruct>
<biblStruct coords="13,72.59,344.11,220.31,7.86;13,72.59,353.74,209.55,7.86;13,72.59,363.36,219.59,7.86;13,72.59,372.99,220.31,7.86;13,72.59,382.61,206.45,7.86" xml:id="b6">
<analytic>
<title level="a" type="main">Spanner</title>
<author>
<persName><forename type="first">James</forename><forename type="middle">C</forename><surname>Corbett</surname></persName>
</author>
<author>
<persName><forename type="first">Peter</forename><surname>Hochschild</surname></persName>
</author>
<author>
<persName><forename type="first">Wilson</forename><surname>Hsieh</surname></persName>
</author>
<author>
<persName><forename type="first">Sebastian</forename><surname>Kanthak</surname></persName>
</author>
<author>
<persName><forename type="first">Eugene</forename><surname>Kogan</surname></persName>
</author>
<author>
<persName><forename type="first">Hongyi</forename><surname>Li</surname></persName>
</author>
<author>
<persName><forename type="first">Alexander</forename><surname>Lloyd</surname></persName>
</author>
<author>
<persName><forename type="first">Sergey</forename><surname>Melnik</surname></persName>
</author>
<author>
<persName><forename type="first">David</forename><surname>Mwaura</surname></persName>
</author>
<author>
<persName><forename type="first">David</forename><surname>Nagle</surname></persName>
</author>
<author>
<persName><forename type="first">Sean</forename><surname>Quinlan</surname></persName>
</author>
<author>
<persName><forename type="first">Jeffrey</forename><surname>Dean</surname></persName>
</author>
<author>
<persName><forename type="first">Rajesh</forename><surname>Rao</surname></persName>
</author>
<author>
<persName><forename type="first">Lindsay</forename><surname>Rolig</surname></persName>
</author>
<author>
<persName><forename type="first">Yasushi</forename><surname>Saito</surname></persName>
</author>
<author>
<persName><forename type="first">Michal</forename><surname>Szymaniak</surname></persName>
</author>
<author>
<persName><forename type="first">Christopher</forename><surname>Taylor</surname></persName>
</author>
<author>
<persName><forename type="first">Ruth</forename><surname>Wang</surname></persName>
</author>
<author>
<persName><forename type="first">Dale</forename><surname>Woodford</surname></persName>
</author>
<author>
<persName><forename type="first">Michael</forename><surname>Epstein</surname></persName>
</author>
<author>
<persName><forename type="first">Andrew</forename><surname>Fikes</surname></persName>
</author>
<author>
<persName><forename type="first">Christopher</forename><surname>Frost</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">J</forename><forename type="middle">J</forename><surname>Furman</surname></persName>
</author>
<author>
<persName><forename type="first">Sanjay</forename><surname>Ghemawat</surname></persName>
</author>
<author>
<persName><forename type="first">Andrey</forename><surname>Gubarev</surname></persName>
</author>
<author>
<persName><forename type="first">Christopher</forename><surname>Heiser</surname></persName>
</author>
<idno type="DOI">10.1145/2518037.2491245</idno>
</analytic>
<monogr>
<title level="j">ACM Transactions on Computer Systems</title>
<title level="j" type="abbrev">ACM Trans. Comput. Syst.</title>
<idno type="ISSN">0734-2071</idno>
<imprint>
<biblScope unit="volume">31</biblScope>
<biblScope unit="issue">3</biblScope>
<biblScope unit="page" from="1" to="22" />
<date type="published" when="2012">2012</date>
<publisher>Association for Computing Machinery (ACM)</publisher>
</imprint>
</monogr>
<note type="raw_reference">J. C. Corbett, J. Dean, M. Epstein, A. Fikes, C. Frost, J. J. Furman, S. Ghemawat, A. Gubarev, C. Heiser, et al. Spanner: Google&apos;s globally-distributed database. In Proceeding of the USENIX Symposium on Operating Systems Design and Implementation (OSDI), 2012.</note>
</biblStruct>
<biblStruct coords="13,72.59,393.23,164.66,7.86;13,72.59,402.85,165.77,7.86;13,72.59,412.48,195.42,7.86;13,72.59,422.10,210.94,7.86;13,72.59,431.73,219.45,7.86;13,72.59,441.35,84.19,7.86" xml:id="b7">
<analytic>
<title level="a" type="main">Dynamo</title>
<author>
<persName><forename type="first">Giuseppe</forename><surname>Decandia</surname></persName>
</author>
<author>
<persName><forename type="first">Deniz</forename><surname>Hastorun</surname></persName>
</author>
<author>
<persName><forename type="first">Madan</forename><surname>Jampani</surname></persName>
</author>
<author>
<persName><forename type="first">Gunavardhan</forename><surname>Kakulapati</surname></persName>
</author>
<author>
<persName><forename type="first">Avinash</forename><surname>Lakshman</surname></persName>
</author>
<author>
<persName><forename type="first">Alex</forename><surname>Pilchin</surname></persName>
</author>
<author>
<persName><forename type="first">Swaminathan</forename><surname>Sivasubramanian</surname></persName>
</author>
<author>
<persName><forename type="first">Peter</forename><surname>Vosshall</surname></persName>
</author>
<author>
<persName><forename type="first">Werner</forename><surname>Vogels</surname></persName>
</author>
<idno type="DOI">10.1145/1294261.1294281</idno>
</analytic>
<monogr>
<title level="m">Proceedings of twenty-first ACM SIGOPS symposium on Operating systems principles</title>
<meeting>twenty-first ACM SIGOPS symposium on Operating systems principles</meeting>
<imprint>
<publisher>ACM</publisher>
<date type="published" when="2007-10-14">2007</date>
</imprint>
</monogr>
<note type="raw_reference">G. DeCandia, D. Hastorun, M. Jampani, G. Kakulapati, A. Lakshman, A. Pilchin, S. Sivasubramanian, P. Vosshall, and W. Vogels. Dynamo: Amazon&apos;s highly available key-value store. In Proceeding of the ACM SIGOPS Operating Systems Review (OSR), 2007.</note>
</biblStruct>
<biblStruct coords="13,72.59,451.97,216.67,7.86;13,72.59,461.59,213.40,7.86;13,72.59,471.22,195.74,7.86;13,72.59,480.84,141.66,7.86" xml:id="b8">
<analytic>
<title level="a" type="main">Embedded inodes and explicit grouping: Exploiting disk bandwidth for small files</title>
<author>
<persName coords=""><forename type="first">G</forename><forename type="middle">R</forename><surname>Ganger</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">M</forename><forename type="middle">F</forename><surname>Kaashoek</surname></persName>
</author>
</analytic>
<monogr>
<title level="m">Proceeding of the USENIX Annual Technical Conference (ATC)</title>
<meeting>eeding of the USENIX Annual Technical Conference (ATC)</meeting>
<imprint>
<date type="published" when="1997">1997</date>
</imprint>
</monogr>
<note type="raw_reference">G. R. Ganger and M. F. Kaashoek. Embedded inodes and explicit grouping: Exploiting disk bandwidth for small files. In Proceeding of the USENIX Annual Technical Conference (ATC), 1997.</note>
</biblStruct>
<biblStruct coords="13,72.59,491.46,194.06,7.86;13,72.59,501.08,188.25,7.86;13,72.59,510.71,199.24,7.86" xml:id="b9">
<analytic>
<title level="a" type="main">The Google file system</title>
<author>
<persName><forename type="first">Sanjay</forename><surname>Ghemawat</surname></persName>
</author>
<author>
<persName><forename type="first">Howard</forename><surname>Gobioff</surname></persName>
</author>
<author>
<persName><forename type="first">Shun-Tak</forename><surname>Leung</surname></persName>
</author>
<idno type="DOI">10.1145/1165389.945450</idno>
</analytic>
<monogr>
<title level="j">ACM SIGOPS Operating Systems Review</title>
<title level="j" type="abbrev">SIGOPS Oper. Syst. Rev.</title>
<idno type="ISSN">0163-5980</idno>
<imprint>
<biblScope unit="volume">37</biblScope>
<biblScope unit="issue">5</biblScope>
<biblScope unit="page" from="29" to="43" />
<date type="published" when="2003-10-19">2003</date>
<publisher>Association for Computing Machinery (ACM)</publisher>
</imprint>
</monogr>
<note type="raw_reference">S. Ghemawat, H. Gobioff, and S.-T. Leung. The Google File System. In Proceeding of the ACM SIGOPS Operating Systems Review (OSR), 2003.</note>
</biblStruct>
<biblStruct coords="13,72.59,521.33,215.20,7.86;13,72.59,530.95,204.96,7.86;13,72.59,540.58,108.25,7.86" xml:id="b10">
<monogr>
<title level="m" type="main">Ozone: An object store in HDFS</title>
<author>
<persName coords=""><surname>Hortonworks</surname></persName>
</author>
<ptr target="http://hortonworks.com/blog/ozone-object-store-hdfs/" />
<imprint>
<date type="published" when="2014">2014. accessed Mar, 2016</date>
</imprint>
</monogr>
<note type="raw_reference">Hortonworks. Ozone: An object store in HDFS. http: //hortonworks.com/blog/ozone-object-store-hdfs/, 2014 (accessed Mar, 2016).</note>
</biblStruct>
<biblStruct coords="13,72.59,551.20,202.79,7.86;13,72.59,560.82,208.81,7.86;13,72.59,570.44,187.36,7.86;13,72.59,580.07,141.66,7.86" xml:id="b11">
<analytic>
<title level="a" type="main">Zookeeper: Wait-free coordination for internet-scale systems</title>
<author>
<persName coords=""><forename type="first">P</forename><surname>Hunt</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">M</forename><surname>Konar</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">F</forename><forename type="middle">P</forename><surname>Junqueira</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">B</forename><surname>Reed</surname></persName>
</author>
</analytic>
<monogr>
<title level="m">Proceeding of the USENIX Annual Technical Conference (ATC)</title>
<meeting>eeding of the USENIX Annual Technical Conference (ATC)</meeting>
<imprint>
<date type="published" when="2010">2010</date>
</imprint>
</monogr>
<note type="raw_reference">P. Hunt, M. Konar, F. P. Junqueira, and B. Reed. Zookeeper: Wait-free coordination for internet-scale systems. In Proceeding of the USENIX Annual Technical Conference (ATC), 2010.</note>
</biblStruct>
<biblStruct coords="13,72.59,590.69,188.68,7.86;13,72.59,600.31,205.46,7.86;13,72.59,609.94,181.83,7.86;13,72.59,619.56,147.68,7.86" xml:id="b12">
<analytic>
<title level="a" type="main">The 4&lt;sup&gt;th&lt;/sup&gt; international workshop on networking meets databases (NetDB&amp;#x2019;08)</title>
<author>
<persName coords=""><forename type="first">J</forename><surname>Kreps</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">N</forename><surname>Narkhede</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">J</forename><surname>Rao</surname></persName>
</author>
<idno type="DOI">10.1109/icdew.2008.4498310</idno>
</analytic>
<monogr>
<title level="m">2008 IEEE 24th International Conference on Data Engineering Workshop</title>
<imprint>
<publisher>IEEE</publisher>
<date type="published" when="2011">2011</date>
</imprint>
</monogr>
<note type="raw_reference">J. Kreps, N. Narkhede, J. Rao, et al. Kafka: A distributed messaging system for log processing. In Proceeding of the USENIX Networking Meets Databases Workshop (NetDB), 2011.</note>
</biblStruct>
<biblStruct coords="13,72.59,630.18,169.67,7.86;13,72.59,639.80,218.55,7.86;13,72.59,649.43,193.72,7.86;13,72.59,659.05,96.78,7.86" xml:id="b13">
<analytic>
<title level="a" type="main">Cassandra</title>
<author>
<persName><forename type="first">Avinash</forename><surname>Lakshman</surname></persName>
</author>
<author>
<persName><forename type="first">Prashant</forename><surname>Malik</surname></persName>
</author>
<idno type="DOI">10.1145/1773912.1773922</idno>
</analytic>
<monogr>
<title level="j">ACM SIGOPS Operating Systems Review</title>
<title level="j" type="abbrev">SIGOPS Oper. Syst. Rev.</title>
<idno type="ISSN">0163-5980</idno>
<imprint>
<biblScope unit="volume">44</biblScope>
<biblScope unit="issue">2</biblScope>
<biblScope unit="page" from="35" to="40" />
<date type="published" when="2010-04-14">2010</date>
<publisher>Association for Computing Machinery (ACM)</publisher>
</imprint>
</monogr>
<note type="raw_reference">A. Lakshman and P. Malik. Cassandra: A decentralized structured storage system. In Proceeding of the ACM SIGOPS Operating Systems Review (OSR), number 2, 2010.</note>
</biblStruct>
<biblStruct coords="13,72.59,669.67,201.83,7.86;13,72.59,679.30,214.28,7.86;13,335.61,57.64,206.72,7.86;13,335.61,67.26,105.59,7.86" xml:id="b14">
<analytic>
<title level="a" type="main">Petal</title>
<author>
<persName><forename type="first">Edward</forename><forename type="middle">K</forename><surname>Lee</surname></persName>
</author>
<author>
<persName><forename type="first">Chandramohan</forename><forename type="middle">A</forename><surname>Thekkath</surname></persName>
</author>
<idno type="DOI">10.1145/237090.237157</idno>
</analytic>
<monogr>
<title level="m">Proceedings of the seventh international conference on Architectural support for programming languages and operating systems - ASPLOS-VII</title>
<meeting>the seventh international conference on Architectural support for programming languages and operating systems - ASPLOS-VII</meeting>
<imprint>
<publisher>ACM Press</publisher>
<date type="published" when="1996">1996</date>
</imprint>
</monogr>
<note type="raw_reference">E. K. Lee and C. A. Thekkath. Petal: Distributed virtual disks. In Proceeding of the ACM Architectural Support for Programming Languages and Operating Systems (ASPLOS), 1996.</note>
</biblStruct>
<biblStruct coords="13,335.60,77.88,219.76,7.86;13,335.61,85.26,211.28,7.86;13,335.61,94.89,191.65,7.86;13,335.61,104.51,209.98,7.86" xml:id="b15">
<analytic>
<title level="a" type="main">Andrew: a distributed personal computing environment</title>
<author>
<persName><forename type="first">James</forename><forename type="middle">H</forename><surname>Morris</surname></persName>
</author>
<author>
<persName><forename type="first">Mahadev</forename><surname>Satyanarayanan</surname></persName>
</author>
<author>
<persName><forename type="first">Michael</forename><forename type="middle">H</forename><surname>Conner</surname></persName>
</author>
<author>
<persName><forename type="first">John</forename><forename type="middle">H</forename><surname>Howard</surname></persName>
</author>
<author>
<persName><forename type="first">David</forename><forename type="middle">S</forename><surname>Rosenthal</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">F</forename><forename type="middle">Donelson</forename><surname>Smith</surname></persName>
</author>
<idno type="DOI">10.1145/5666.5671</idno>
</analytic>
<monogr>
<title level="j">Communications of the ACM</title>
<title level="j" type="abbrev">Commun. ACM</title>
<idno type="ISSN">0001-0782</idno>
<idno type="ISSNe">1557-7317</idno>
<imprint>
<biblScope unit="volume">29</biblScope>
<biblScope unit="issue">3</biblScope>
<biblScope unit="page" from="184" to="201" />
<date type="published" when="1986-03">1986</date>
<publisher>Association for Computing Machinery (ACM)</publisher>
</imprint>
</monogr>
<note type="raw_reference">J. H. Morris, M. Satyanarayanan, M. H. Conner, J. H. Howard, D. S. Rosenthal, and F. D. Smith. Andrew: A distributed personal computing environment. Communications of the ACM (CACM), 29(3), 1986.</note>
</biblStruct>
<biblStruct coords="13,335.60,115.13,201.37,7.86;13,335.61,124.75,212.05,7.86" xml:id="b16">
<analytic>
<title level="a" type="main">Immediate files</title>
<author>
<persName><forename type="first">Sape</forename><forename type="middle">J</forename><surname>Mullender</surname></persName>
</author>
<author>
<persName><forename type="first">Andrew</forename><forename type="middle">S</forename><surname>Tanenbaum</surname></persName>
</author>
<idno type="DOI">10.1002/spe.4380140407</idno>
</analytic>
<monogr>
<title level="j">Software: Practice and Experience</title>
<title level="j" type="abbrev">Softw: Pract. Exper.</title>
<idno type="ISSN">0038-0644</idno>
<idno type="ISSNe">1097-024X</idno>
<imprint>
<biblScope unit="volume">14</biblScope>
<biblScope unit="issue">4</biblScope>
<biblScope unit="page" from="365" to="368" />
<date type="published" when="1984-04">1984</date>
<publisher>Wiley</publisher>
</imprint>
</monogr>
<note type="raw_reference">S. J. Mullender and A. S. Tanenbaum. Immediate files. Software: Practice and Experience, 14(4), 1984.</note>
</biblStruct>
<biblStruct coords="13,335.60,135.37,196.91,7.86;13,335.61,145.00,212.35,7.86;13,335.61,154.62,211.21,7.86;13,335.61,164.25,200.56,7.86;13,335.61,173.87,171.19,7.86" xml:id="b17">
<analytic>
<title level="a" type="main">OSDI (9th Usenix Symposium on Operating Systems Design and Implementation) advertisement</title>
<author>
<persName coords=""><forename type="first">S</forename><surname>Muralidhar</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">W</forename><surname>Lloyd</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">S</forename><surname>Roy</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">C</forename><surname>Hill</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">E</forename><surname>Lin</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">W</forename><surname>Liu</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">S</forename><surname>Pan</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">S</forename><surname>Shankar</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">V</forename><surname>Sivakumar</surname></persName>
</author>
<idno type="DOI">10.1109/msp.2010.134</idno>
</analytic>
<monogr>
<title level="j">IEEE Security &amp; Privacy Magazine</title>
<title level="j" type="abbrev">IEEE Secur. Privacy Mag.</title>
<idno type="ISSN">1540-7993</idno>
<imprint>
<biblScope unit="volume">8</biblScope>
<biblScope unit="issue">4</biblScope>
<biblScope unit="page" from="c4" to="c4" />
<date type="published" when="2014">2014</date>
<publisher>Institute of Electrical and Electronics Engineers (IEEE)</publisher>
</imprint>
</monogr>
<note type="raw_reference">S. Muralidhar, W. Lloyd, S. Roy, C. Hill, E. Lin, W. Liu, S. Pan, S. Shankar, V. Sivakumar, et al. F4: Facebook&apos;s warm blob storage system. In Proceeding of the USENIX Symposium on Operating Systems Design and Implementation (OSDI), 2014.</note>
</biblStruct>
<biblStruct coords="13,335.60,184.49,183.99,7.86;13,335.61,194.11,97.80,7.86;13,335.61,203.74,206.55,7.86;13,335.61,213.36,105.19,7.86" xml:id="b18">
<monogr>
<author>
<persName coords=""><surname>Oracle</surname></persName>
</author>
<ptr target="https://docs.oracle.com/database/121/ADLOB/toc.htm" />
<title level="m">Database securefiles and large objects developer&apos;s guide</title>
<imprint>
<date type="published" when="2011-03">2011. Mar, 2016</date>
</imprint>
</monogr>
<note type="raw_reference">Oracle. Database securefiles and large objects developer&apos;s guide. https: //docs.oracle.com/database/121/ADLOB/toc.htm, 2011(accessed Mar, 2016).</note>
</biblStruct>
<biblStruct coords="13,335.60,223.98,209.10,7.86;13,335.61,233.61,188.59,7.86;13,335.61,243.23,212.89,7.86;13,335.61,252.85,215.02,7.86;13,335.61,262.48,133.41,7.86" xml:id="b19">
<analytic>
<title level="a" type="main">IndexFS: Scaling File System Metadata Performance with Stateless Caching and Bulk Insertion</title>
<author>
<persName><forename type="first">Kai</forename><surname>Ren</surname></persName>
</author>
<author>
<persName><forename type="first">Qing</forename><surname>Zheng</surname></persName>
</author>
<author>
<persName><forename type="first">Swapnil</forename><surname>Patil</surname></persName>
</author>
<author>
<persName><forename type="first">Garth</forename><surname>Gibson</surname></persName>
</author>
<idno type="DOI">10.1109/sc.2014.25</idno>
</analytic>
<monogr>
<title level="m">SC14: International Conference for High Performance Computing, Networking, Storage and Analysis</title>
<imprint>
<publisher>IEEE</publisher>
<date type="published" when="2014-11">2014</date>
</imprint>
</monogr>
<note type="raw_reference">K. Ren, Q. Zheng, S. Patil, and G. Gibson. Indexfs: Scaling file system metadata performance with stateless caching and bulk insertion. In Proceeding of the IEEE High Performance Computing, Networking, Storage and Analysis (SC), 2014.</note>
</biblStruct>
<biblStruct coords="13,335.60,273.10,215.71,7.86;13,335.61,282.72,212.62,7.86;13,335.61,292.35,207.98,7.86;13,335.61,301.97,20.96,7.86" xml:id="b20">
<analytic>
<title level="a" type="main">The design and implementation of a log-structured file system</title>
<author>
<persName><forename type="first">Mendel</forename><surname>Rosenblum</surname></persName>
</author>
<author>
<persName><forename type="first">John</forename><forename type="middle">K</forename><surname>Ousterhout</surname></persName>
</author>
<idno type="DOI">10.1145/146941.146943</idno>
</analytic>
<monogr>
<title level="j">ACM Transactions on Computer Systems</title>
<title level="j" type="abbrev">ACM Trans. Comput. Syst.</title>
<idno type="ISSN">0734-2071</idno>
<idno type="ISSNe">1557-7333</idno>
<imprint>
<biblScope unit="volume">10</biblScope>
<biblScope unit="issue">1</biblScope>
<biblScope unit="page" from="26" to="52" />
<date type="published" when="1992-02">1992</date>
<publisher>Association for Computing Machinery (ACM)</publisher>
</imprint>
</monogr>
<note type="raw_reference">M. Rosenblum and J. K. Ousterhout. The design and implementation of a log-structured file system. ACM Transactions on Computer Systems (TOCS), 10(1), 1992.</note>
</biblStruct>
<biblStruct coords="13,335.60,312.59,218.88,7.86;13,335.61,322.21,193.21,7.86;13,335.61,331.84,201.42,7.86;13,335.61,341.46,148.11,7.86" xml:id="b21">
<analytic>
<title level="a" type="main">Design and implementation of the Sun network file system</title>
<author>
<persName coords=""><forename type="first">R</forename><surname>Sandberg</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">D</forename><surname>Goldberg</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">S</forename><surname>Kleiman</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">D</forename><surname>Walsh</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">B</forename><surname>Lyon</surname></persName>
</author>
</analytic>
<monogr>
<title level="m">Proceeding of the USENIX Summer Technical Conference</title>
<meeting>eeding of the USENIX Summer Technical Conference</meeting>
<imprint>
<date type="published" when="1985">1985</date>
</imprint>
</monogr>
<note type="raw_reference">R. Sandberg, D. Goldberg, S. Kleiman, D. Walsh, and B. Lyon. Design and implementation of the Sun network file system. In Proceeding of the USENIX Summer Technical Conference, 1985.</note>
</biblStruct>
<biblStruct coords="13,335.60,352.08,220.11,7.86;13,335.61,361.71,214.73,7.86;13,335.61,371.33,220.32,7.86;13,335.61,380.95,70.89,7.86" xml:id="b22">
<analytic>
<title level="a" type="main">Transaction support in a log-structured file system</title>
<author>
<persName coords=""><forename type="first">M</forename><forename type="middle">I</forename><surname>Seltzer</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">K</forename><surname>Bostic</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">M</forename><forename type="middle">K</forename><surname>Mckusick</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">C</forename><surname>Staelin</surname></persName>
</author>
<idno type="DOI">10.1109/icde.1993.344029</idno>
</analytic>
<monogr>
<title level="m">Proceedings of IEEE 9th International Conference on Data Engineering</title>
<meeting>IEEE 9th International Conference on Data Engineering</meeting>
<imprint>
<publisher>IEEE Comput. Soc. Press</publisher>
<date type="published" when="1993">1993</date>
</imprint>
</monogr>
<note type="raw_reference">M. Seltzer, K. Bostic, M. K. Mckusick, and C. Staelin. An implementation of a log-structured file system for UNIX. In Proceeding of the USENIX Winter Technical Conference, 1993.</note>
</biblStruct>
<biblStruct coords="13,335.60,391.57,208.42,7.86;13,335.61,401.20,219.47,7.86;13,335.61,410.82,200.57,7.86;13,335.61,420.45,59.27,7.86" xml:id="b23">
<analytic>
<title level="a" type="main">The Hadoop Distributed File System</title>
<author>
<persName><forename type="first">Konstantin</forename><surname>Shvachko</surname></persName>
</author>
<author>
<persName><forename type="first">Hairong</forename><surname>Kuang</surname></persName>
</author>
<author>
<persName><forename type="first">Sanjay</forename><surname>Radia</surname></persName>
</author>
<author>
<persName><forename type="first">Robert</forename><surname>Chansler</surname></persName>
</author>
<idno type="DOI">10.1109/msst.2010.5496972</idno>
</analytic>
<monogr>
<title level="m">2010 IEEE 26th Symposium on Mass Storage Systems and Technologies (MSST)</title>
<imprint>
<publisher>IEEE</publisher>
<date type="published" when="2010-05">2010</date>
</imprint>
</monogr>
<note type="raw_reference">K. Shvachko, H. Kuang, S. Radia, and R. Chansler. The Hadoop Distributed File System. In Proceeding of the IEEE Mass Storage Systems and Technologies (MSST), 2010.</note>
</biblStruct>
<biblStruct coords="13,335.60,431.07,201.53,7.86;13,335.61,440.69,128.32,7.86" xml:id="b24">
<analytic>
<title level="a" type="main">Zero copy I: User-mode perspective</title>
<author>
<persName coords=""><forename type="first">D</forename><surname>Stancevic</surname></persName>
</author>
</analytic>
<monogr>
<title level="j">Linux Journal</title>
<imprint>
<biblScope unit="issue">105</biblScope>
<date type="published" when="2003">2003. 2003</date>
</imprint>
</monogr>
<note type="raw_reference">D. Stancevic. Zero copy I: User-mode perspective. Linux Journal, 2003(105), 2003.</note>
</biblStruct>
<biblStruct coords="13,335.60,451.31,214.55,7.86;13,335.61,460.93,195.63,7.86;13,335.61,470.56,216.86,7.86;13,335.61,480.18,175.83,7.86;13,335.61,489.80,146.81,7.86" xml:id="b25">
<analytic>
<title level="a" type="main">Chord</title>
<author>
<persName><forename type="first">Ion</forename><surname>Stoica</surname></persName>
</author>
<author>
<persName><forename type="first">Robert</forename><surname>Morris</surname></persName>
</author>
<author>
<persName><forename type="first">David</forename><surname>Karger</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">M</forename><forename type="middle">Frans</forename><surname>Kaashoek</surname></persName>
</author>
<author>
<persName><forename type="first">Hari</forename><surname>Balakrishnan</surname></persName>
</author>
<idno type="DOI">10.1145/964723.383071</idno>
</analytic>
<monogr>
<title level="j">ACM SIGCOMM Computer Communication Review</title>
<title level="j" type="abbrev">SIGCOMM Comput. Commun. Rev.</title>
<idno type="ISSN">0146-4833</idno>
<imprint>
<biblScope unit="volume">31</biblScope>
<biblScope unit="issue">4</biblScope>
<biblScope unit="page" from="149" to="160" />
<date type="published" when="2001-08-27">2001</date>
<publisher>Association for Computing Machinery (ACM)</publisher>
</imprint>
</monogr>
<note type="raw_reference">I. Stoica, R. Morris, D. Karger, M. F. Kaashoek, and H. Balakrishnan. Chord: A scalable peer-to-peer lookup service for Internet applications. In Proceeding of the ACM Special Interest Group on Data Communication (SIGCOMM), 2001.</note>
</biblStruct>
<biblStruct coords="13,335.60,500.42,212.38,7.86;13,335.61,510.05,158.94,7.86;13,335.61,519.67,204.82,7.86;13,335.61,529.30,108.25,7.86" xml:id="b26">
<monogr>
<title level="m" type="main">Blobstore: Twitter&apos;s in-house photo storage system</title>
<author>
<persName coords=""><surname>Twitter</surname></persName>
</author>
<ptr target="https://blog.twitter.com/2012/blobstore-twitter-s-in-house-photo-storage-system" />
<imprint>
<date type="published" when="2011">2011. accessed Mar, 2016</date>
</imprint>
</monogr>
<note type="raw_reference">Twitter. Blobstore: Twitter&apos;s in-house photo storage system. https://blog.twitter.com/2012/ blobstore-twitter-s-in-house-photo-storage-system, 2011 (accessed Mar, 2016).</note>
</biblStruct>
<biblStruct coords="13,335.60,539.92,204.84,7.86;13,335.61,549.54,218.41,7.86;13,335.61,559.16,213.46,7.86;13,335.61,568.79,184.05,7.86;13,335.61,578.41,122.88,7.86" xml:id="b27">
<analytic>
<title level="a" type="main">OSDI (9th Usenix Symposium on Operating Systems Design and Implementation) advertisement</title>
<author>
<persName coords=""><forename type="first">S</forename><forename type="middle">A</forename><surname>Weil</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">S</forename><forename type="middle">A</forename><surname>Brandt</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">E</forename><forename type="middle">L</forename><surname>Miller</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">D</forename><forename type="middle">D</forename><surname>Long</surname></persName>
</author>
<author>
<persName coords=""><forename type="first">C</forename><surname>Maltzahn</surname></persName>
</author>
<idno type="DOI">10.1109/msp.2010.134</idno>
</analytic>
<monogr>
<title level="j">IEEE Security &amp; Privacy Magazine</title>
<title level="j" type="abbrev">IEEE Secur. Privacy Mag.</title>
<idno type="ISSN">1540-7993</idno>
<imprint>
<biblScope unit="volume">8</biblScope>
<biblScope unit="issue">4</biblScope>
<biblScope unit="page" from="c4" to="c4" />
<date type="published" when="2006">2006</date>
<publisher>Institute of Electrical and Electronics Engineers (IEEE)</publisher>
</imprint>
</monogr>
<note type="raw_reference">S. A. Weil, S. A. Brandt, E. L. Miller, D. D. Long, and C. Maltzahn. Ceph: A scalable, high-performance distributed file system. In Proceeding of the USENIX Symposium on Operating Systems Design and Implementation (OSDI), 2006.</note>
</biblStruct>
<biblStruct coords="13,335.60,589.03,171.73,7.86;13,335.61,598.66,175.33,7.86;13,335.61,608.28,184.13,7.86;13,335.61,617.90,219.84,7.86;13,335.61,627.53,184.77,7.86" xml:id="b28">
<analytic>
<title level="a" type="main">CRUSH: Controlled, Scalable, Decentralized Placement of Replicated Data</title>
<author>
<persName><forename type="first">Sage</forename><surname>Weil</surname></persName>
</author>
<author>
<persName><forename type="first">Scott</forename><surname>Brandt</surname></persName>
</author>
<author>
<persName><forename type="first">Ethan</forename><surname>Miller</surname></persName>
</author>
<author>
<persName><forename type="first">Carlos</forename><surname>Maltzahn</surname></persName>
</author>
<idno type="DOI">10.1109/sc.2006.19</idno>
</analytic>
<monogr>
<title level="m">ACM/IEEE SC 2006 Conference (SC&apos;06)</title>
<imprint>
<publisher>IEEE</publisher>
<date type="published" when="2006-11">2006</date>
</imprint>
</monogr>
<note type="raw_reference">S. A. Weil, S. A. Brandt, E. L. Miller, and C. Maltzahn. CRUSH: Controlled, scalable, decentralized placement of replicated data. In Proceeding of the IEEE High Performance Computing, Networking, Storage and Analysis (SC), 2006.</note>
</biblStruct>
<biblStruct coords="13,335.60,638.15,203.63,7.86;13,335.61,647.77,194.42,7.86;13,335.61,657.40,200.11,7.86;13,335.61,667.02,205.67,7.86" xml:id="b29">
<analytic>
<title level="a" type="main">hFS</title>
<author>
<persName><forename type="first">Zhihui</forename><surname>Zhang</surname></persName>
</author>
<author>
<persName><forename type="first">Kanad</forename><surname>Ghose</surname></persName>
</author>
<idno type="DOI">10.1145/1272996.1273016</idno>
</analytic>
<monogr>
<title level="m">Proceedings of the 2nd ACM SIGOPS/EuroSys European Conference on Computer Systems 2007</title>
<meeting>the 2nd ACM SIGOPS/EuroSys European Conference on Computer Systems 2007</meeting>
<imprint>
<publisher>ACM</publisher>
<date type="published" when="2007-03-21">2007</date>
</imprint>
</monogr>
<note type="raw_reference">Z. Zhang and K. Ghose. hFS: A hybrid file system prototype for improving small file and metadata performance. In Proceeding of the ACM European Conference on Computer Systems (EuroSys), 2007.</note>
</biblStruct>
</listBibl>
</div>
</back>
</text>
</TEI>