|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
% The Legrand Orange Book
|
|
% LaTeX Template
|
|
% Version 2.4 (26/09/2018)
|
|
%
|
|
% This template was downloaded from:
|
|
% http://www.LaTeXTemplates.com
|
|
%
|
|
% Original author:
|
|
% Mathias Legrand (legrand.mathias@gmail.com) with modifications by:
|
|
% Vel (vel@latextemplates.com)
|
|
%
|
|
% License:
|
|
% CC BY-NC-SA 3.0 (http://creativecommons.org/licenses/by-nc-sa/3.0/)
|
|
%
|
|
% Compiling this template:
|
|
%
|
|
% 1) pdflatex main
|
|
% 2) makeindex main.idx -s StyleInd.ist
|
|
% 3) biber main
|
|
% 4) pdflatex main x 2
|
|
%
|
|
% After this, when you wish to update the bibliography/index use the appropriate
|
|
% command above and make sure to compile with pdflatex several times
|
|
% afterwards to propagate your changes to the document.
|
|
%
|
|
% Chapter heading images should have a 2:1 width:height ratio,
|
|
% e.g. 920px width and 460px height.
|
|
%
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
|
|
%----------------------------------------------------------------------------------------
|
|
% PACKAGES AND OTHER DOCUMENT CONFIGURATIONS
|
|
%----------------------------------------------------------------------------------------
|
|
|
|
\documentclass[12pt,fleqn]{book} % Default font size and left-justified equations
|
|
\usepackage[automake]{glossaries}
|
|
\usepackage{amssymb}
|
|
\usepackage{wasysym}
|
|
\usepackage{eurosym}
|
|
\usepackage{amsmath}
|
|
\usepackage{numprint}
|
|
\usepackage{bytefield}
|
|
\usepackage{siunitx}
|
|
\usepackage{placeins}
|
|
\usepackage{pgf-umlsd}
|
|
\usepackage{adjustbox}
|
|
\usepackage{multirow}
|
|
\usepackage{enumitem}
|
|
\usepackage{hhline}
|
|
\usepackage{pgfplots}
|
|
\usepackage{float}
|
|
|
|
\let\Oldsection\section
|
|
\renewcommand{\section}{\FloatBarrier\Oldsection}
|
|
|
|
\let\Oldsubsection\subsection
|
|
\renewcommand{\subsection}{\FloatBarrier\Oldsubsection}
|
|
|
|
\let\Oldsubsubsection\subsubsection
|
|
\renewcommand{\subsubsection}{\FloatBarrier\Oldsubsubsection}
|
|
|
|
\author{Ludovic `Archivist' Lagouardette}
|
|
\title{Advanced Storage System}
|
|
\date{2019}
|
|
|
|
\makeglossaries
|
|
\input{structure.tex} % Insert the structure.tex file which contains the majority of the structure behind the template
|
|
|
|
|
|
%\hypersetup{pdftitle={Title},pdfauthor={Author}} % Uncomment and fill out to include PDF metadata for the author and title of the book
|
|
|
|
%----------------------------------------------------------------------------------------
|
|
|
|
\begin{document}
|
|
|
|
%----------------------------------------------------------------------------------------
|
|
% TITLE PAGE
|
|
%----------------------------------------------------------------------------------------
|
|
|
|
\begingroup
|
|
\thispagestyle{empty} % Suppress headers and footers on the title page
|
|
\begin{tikzpicture}[remember picture,overlay]
|
|
\node[inner sep=0pt] (background) at (current page.center) {\includegraphics[width=\paperwidth]{background.pdf}};
|
|
\draw (current page.center) node [fill=ocre!30!white,fill opacity=0.6,text opacity=1,inner sep=1cm]{\Huge\centering\bfseries\sffamily\parbox[c][][t]{\paperwidth}{\centering Advanced Storage System\\[15pt] % Book title
|
|
{\Large Whitepaper}\\[20pt] % Subtitle
|
|
{\small Ludovic `Archivist' Lagouardette}}}; % Author name
|
|
\end{tikzpicture}
|
|
\vfill
|
|
\endgroup
|
|
\frontmatter
|
|
|
|
%----------------------------------------------------------------------------------------
|
|
% COPYRIGHT PAGE
|
|
%----------------------------------------------------------------------------------------
|
|
|
|
\newpage
|
|
~\vfill
|
|
\thispagestyle{empty}
|
|
|
|
\noindent Copyright \copyright\ 2019 NekoIT\\ % Copyright notice
|
|
|
|
\noindent \textsc{Published by NekoIT}\\ % Publisher
|
|
|
|
\noindent\includegraphics[width=8cm]{Pictures/nekoit_title}
|
|
|
|
\noindent \textsc{https://archivist.nekoit.xyz}\\ % URL
|
|
|
|
\noindent This document is under intellectual property of NekoIT, reproduction is allowed in digital format only.\\ % License information, replace this with your own license (if any)
|
|
|
|
\noindent \textit{First printing, 2019} % Printing/edition date
|
|
|
|
%----------------------------------------------------------------------------------------
|
|
% TABLE OF CONTENTS
|
|
%----------------------------------------------------------------------------------------
|
|
|
|
%\usechapterimagefalse % If you don't want to include a chapter image, use this to toggle images off - it can be enabled later with \usechapterimagetrue
|
|
|
|
\chapterimage{chapter_head_1.jpg} % Table of contents heading image
|
|
|
|
|
|
|
|
\cleardoublepage % Forces the first chapter to start on an odd page so it's on the right side of the book
|
|
|
|
\pagestyle{fancy} % Enable headers and footers again
|
|
|
|
\chapter{Introduction}
|
|
|
|
\pagestyle{empty} % Disable headers and footers for the following pages
|
|
\tableofcontents % Print the table of contents itself
|
|
\pagestyle{fancy} % Enable headers and footers again
|
|
|
|
\mainmatter
|
|
\part{Human friendly specification}
|
|
|
|
\chapter{The state of cloud storage}
|
|
|
|
\vspace{-5em}
|
|
\begin{center}
|
|
{\tiny All brands and companies belong to their rightful owners. We are independent from them and are only quoting them for the sake of comparison.}
|
|
\end{center}
|
|
\vspace{5em}
|
|
|
|
\vspace{-3em}\hspace{-1.4em}Nowadays, cloud storage is getting a fair amount of popularity: saving space on mobile devices, backing up important data, getting access to data while using multiple devices.
|
|
|
|
\section{Competition}
|
|
|
|
Multiple service providers, from Google to Amazon as well as some other smaller actors have tackled the task of storing data for a variety of use-cases, for a variety of pricing tables and options.
|
|
|
|
In this section we will take a look at the current state of competition. We do not aim at taking a look at the whole market but at a number of important or interesting actors.
|
|
|
|
\subsection{Google Drive and Google Cloud Platform}
|
|
|
|
Google is famous for its economical impact on the software development industry. This is also true of its Google Drive and Google Cloud Platform products.
|
|
|
|
It is to the point where you can label its product the cheapest way to back up multiple terabytes\autocite{ltt_backup}. You can also quote their other product, Google Cloud Storage, as a cheap yet very efficient tool to store data for applications and websites.
|
|
|
|
They however do not offer any kind of protection on their services, the data stored on their side is not encrypted and they may use it for advertisement purposes for example. They however are not misleading on their offer even if their product is not privacy centered at all.
|
|
|
|
\subsection{Amazon Cloud Drive and their variety of services}
|
|
|
|
We will not expand on how varied and efficient Amazon cloud storage is. They basically provide about all types of storage for any type of data structure from the typical file system to the most advanced layouts of databases.
|
|
|
|
Their prices are slightly higher than those of Google. Like Google however, they only propose encryption of the data while in transit.
|
|
|
|
\subsection{Operation Tulip (NextCloud over Ceph)}
|
|
|
|
An open-source initiative to propose a simple cloud suite with some file storage and some tools like a calendar and an online LibreOffice implementation.
|
|
|
|
This service is in open beta (can be tested by anyone) and uses some of the most used open source software to hold encrypted data and deal with storage redundancy: Ceph and NextCloud.
|
|
|
|
It is however not to be used for actively using the data but more as a backup solution and cold storage.
|
|
|
|
\subsection{Backblaze}
|
|
|
|
A data backup company that offers multiple solutions with diverse options. They permit the use of forms of secure encryption. They however do not encrypt all of the metadata and reserve the right to sell those to a variety of companies.
|
|
|
|
They also offer storage for live data in one of their offers. Their prices are relatively competitive compared to Amazon Cloud Storage for example.
|
|
|
|
\subsection{Dropbox}
|
|
|
|
A widely used actor in backup cloud storage systems. They provide multiple tiers of pricing, from a free offer to multiple paid storage offers. All of them are meant for dead storage for roaming and sharing files.
|
|
|
|
\subsection{Tarsnap}
|
|
|
|
A small actor based in Canada. They offer cold storage services, encrypted and open-source on client side. Their pricing is on an \textit{``as you go''} basis, pricing network traffic as well as storage used.
|
|
|
|
\section{Technology}
|
|
|
|
Multiple technologies and their open-source counterparts can be used to handle online data storage. In this section we will explore those possibilities by comparing both commercial and free solutions where possible.
|
|
|
|
\subsection{Google Spanner and CockroachDB}
|
|
|
|
Google Spanner and CockroachDB are two database systems for geo-replicated databases. They both use a clock-based mechanism for handling transactions, making them faster with better clock synchronization. CockroachDB however has lower requirements on clock accuracy than Google Spanner does\autocite{cockroach_atomic}.
|
|
|
|
Google Spanner is as its name implies a proprietary product from Google. CockroachDB is an open-source project from CockroachLabs made to implement as much of Google Spanner features as possible. It also intends to try to be compatible with PostgreSQL to ease application porting\autocite{cockroach_postgres}.
|
|
|
|
Both of those tools can be used to implement either a block based storage or an object storage to use to implement a geo-replicated filesystem.
|
|
|
|
Using CockroachDB as a back-end to implement the system was envisioned, but latency tests made us choose to rather use a custom implemented data server. We however use a very similar way of resolving database conflicts (see the sequence diagrams~\ref{fig:confirmation_proto} and~\ref{fig:2user_confirmation_proto} on page~\pageref{fig:confirmation_proto}).
|
|
|
|
\subsection{Ceph, RADOS and CRUSH}
|
|
|
|
Ceph is a distributed data storage system. It uses RADOS (Reliable Autonomic Distributed Object Store), a storage system designed around the idea of placing data in a predictable place following a mathematical equation. This is named CRUSH, for Controlled Replication Under Scalable Hashing.
|
|
|
|
Placement of data in our system follows some concepts from Ceph, RADOS and CRUSH.
|
|
|
|
This system is currently in development by the CERN. They use it as a back-end for many types of storage, from filesystems to block devices and storage for scientific data before analysis.
|
|
|
|
As mentioned in the listing of other actors, the Operation Tulip project is using it to store the files they manipulate. Other actors not mentioned above, like Ovh, use it too, for example for storing Virtual Machines and as storage for cloud computing.
|
|
|
|
\subsection{NextCloud}
|
|
|
|
NextCloud is an open-source system written in PHP to be used as a front-end for cloud hosting. It supports WebDAV and other protocols as well as providing multiple productivity features from text edition to spreadsheets and calendars.
|
|
|
|
It however is slow due to having been designed in a programming language unsuitable for performance applications.
|
|
|
|
It supports end to end encryption.
|
|
|
|
\section{Hardware and hosting}
|
|
|
|
Naming it cloud storage doesn't mean the data is in some phantasmagorical place. As such we will study here the possibilities for one to deploy his own cluster of servers to host his own data.
|
|
|
|
For that we will compare pricing of the hardware required to deploy our solution online for a data size around 50\si{\tera{}B (\pm 5\percent)} of storage.
|
|
|
|
\begin{table}[h]
|
|
\centering
|
|
\begin{tabular}{|l|l|l|l|}
|
|
\hline
|
|
System & Upfront price & Price per GB per year & Amort. (y) \\\hhline{|=|=|=|=|}
|
|
SuperMicro SC825TQ-560LP $\times 3$ & USD15100 & USD0.21 & 5 \\
|
|
and SuperMicro 5018D-MF (new) & +USD900/m & & \\\hline
|
|
HP~ProLiant~DL180-G5 $\times 4$ & USD3350 & USD0.21 & 3 \\
|
|
(refurbished) & +USD900/m & & \\\hline
|
|
Ovh rented servers $\times 4$ & USD890/m & USD0.21 & 0 \\\hline
|
|
\end{tabular}
|
|
\textit{It is to be noted that the performance is also decreasing with each category down.}
|
|
\caption{Server pricing}
|
|
\label{tab:server_pricing}
|
|
\end{table}
|
|
|
|
\subsection{Brand new hardware}
|
|
|
|
Brand new hardware is generally a real investment for an individual or a new company. It is also a technical choice that can have lifelong consequences on the business as computers are subdivided in families that each have specific features and behaviours.
|
|
|
|
Of those families, named architectures, we will consider two: the \texttt{x86\_{}64}, also referred to as \texttt{amd64}; and the most recent architecture from the \texttt{ARM} group, the \texttt{ARMv8} architecture and its variants.
|
|
|
|
Both of them share the minimal set of features for a type of storage named a \texttt{memory mapped hash table} to be implementable to a usable degree.
|
|
|
|
\subsubsection{\texttt{x86\_{}64} architecture}
|
|
|
|
This architecture is common to most modern computers, laptops, workstations and servers nowadays. It is therefore easy to make software for it and it is well documented.
|
|
|
|
It however has the huge drawback of being power hungry: having been extended for more and more performance, it tends to consume lots of power and hence to require proportional cooling.
|
|
|
|
Taking for example a server from SuperMicro SC825TQ-560LP, we estimate a price of around 4'500USD per server for the data storage, requiring at least 3 of them, additional ones for ensuring safety in case one of them fails, as well as any other server for handling coordination of data storage.
|
|
|
|
For that we advise a server of the likes of a SuperMicro 5018D-MF, for which we estimate a price of about 1'600USD if equipped with a proper network card for handling connections to the storage servers properly.
|
|
|
|
\subsubsection{\texttt{ARMv8} architecture}
|
|
|
|
This architecture being relatively new, we will not venture into pricing it, but we think that adapted servers for storage equipped with ThunderX2 CPUs from Cavium would do well as storage servers, and likewise equipped servers with a ThunderX2 adapted for computationally heavy loads would fit the use case as a coordination server.
|
|
|
|
This setup is however untested and it would not be possible for us to test it at the time of writing. These servers would also not run some operating systems critical for the safety of network infrastructure, like OpenBSD.
|
|
|
|
\subsection{Refurbished hardware}
|
|
|
|
As for refurbished hardware, we looked into the products of professionals in sales of refurbished hardware. We advise for those with a small budget a constellation of HP~ProLiant~DL180-G5, with a price per server of about 950USD (disks being new), and any server with a decent enough set of network connectivity to not be a bottleneck.
|
|
|
|
\subsection{Rented dedicated servers}
|
|
|
|
As for dedicated servers, we got our eyes on Ovh, which would propose to rent servers for 230USD per month per server with an added 80USD per month for the coordination server. This doesn't encompass any backup server additionally needed to guarantee fast replication if one of the three servers fails, but it takes into account all hosting costs.
|
|
|
|
\section{The users}
|
|
|
|
\begin{figure}[h]
|
|
\centering
|
|
\begin{tikzpicture}
|
|
\begin{axis}[
|
|
ybar,
|
|
enlargelimits=0.15,
|
|
legend style={at={(0.5,-0.15)},
|
|
anchor=north,legend columns=-1},
|
|
ylabel={Percentage of surveyed people},
|
|
symbolic x coords={Not so concerned,Concerned,Very concerned},
|
|
xtick=data,
|
|
nodes near coords,
|
|
nodes near coords align={vertical},
|
|
]
|
|
\addplot coordinates {(Not so concerned,18.2) (Concerned,13.8) (Very concerned,68.2)};
|
|
\end{axis}
|
|
\end{tikzpicture}
|
|
\caption{Concerns about privacy}
|
|
\label{fig:privacy_concerns}
|
|
\end{figure}
|
|
|
|
\begin{figure}
|
|
\centering
|
|
\begin{tikzpicture}
|
|
\begin{axis}[
|
|
ybar,
|
|
enlargelimits=0.15,
|
|
legend style={at={(0.5,-0.15)},
|
|
anchor=north,legend columns=-1},
|
|
ylabel={Percentage of surveyed people},
|
|
symbolic x coords={A,B,C,D},
|
|
xtick=data,
|
|
nodes near coords,
|
|
nodes near coords align={vertical},
|
|
]
|
|
\addplot coordinates {(A,15.8) (B,52.6) (C,63.2) (D,78.9)};
|
|
\end{axis}
|
|
\end{tikzpicture}
|
|
\begin{minipage}{0.82\textwidth}
|
|
\begin{itemize}
|
|
\item A: Harddrive encryption (Hardware or commercial solution)
|
|
\item B: Harddrive encryption (Open-source solution)
|
|
\item C: VPN
|
|
\item D: Telegram
|
|
\end{itemize}
|
|
\end{minipage}
|
|
\caption{Use of privacy enabling tools}
|
|
\label{fig:privacy_tools}
|
|
\end{figure}
|
|
|
|
\begin{figure}
|
|
\centering
|
|
\begin{tikzpicture}
|
|
\begin{axis}[
|
|
ybar,
|
|
enlarge x limits=0.15,
|
|
ymin=0, ymax=100,
|
|
xmin=A, xmax=D,
|
|
legend style={at={(0.5,0.0)},
|
|
anchor=north,legend columns=-1},
|
|
ylabel={Percentage of surveyed people},
|
|
symbolic x coords={A,B,C,D},
|
|
xtick=data,
|
|
nodes near coords,
|
|
nodes near coords align={vertical},
|
|
]
|
|
\addplot coordinates {(A,86.4) (B,9.1) (C,0.0) (D,4.5)};
|
|
\end{axis}
|
|
\end{tikzpicture}
|
|
\begin{minipage}{0.82\textwidth}
|
|
\begin{itemize}
|
|
\item A: Open-source community approved cryptography
|
|
\item B: Government approved cryptography
|
|
\item C: Hardware implemented cryptography
|
|
\item D: I don't know
|
|
\end{itemize}
|
|
\end{minipage}
|
|
\caption{Opinions on cryptography}
|
|
\label{fig:crypto_tools}
|
|
\end{figure}
|
|
|
|
\begin{figure}
|
|
\centering
|
|
\begin{tikzpicture}
|
|
\begin{axis}[
|
|
ybar,
|
|
enlargelimits=0.15,
|
|
legend style={at={(0.5,-0.15)},
|
|
anchor=north,legend columns=-1},
|
|
ylabel={Percentage of surveyed people},
|
|
symbolic x coords={Yes (Encrypted),Yes,No},
|
|
xtick=data,
|
|
nodes near coords,
|
|
nodes near coords align={vertical},
|
|
]
|
|
\addplot coordinates {(Yes (Encrypted),27.3) (Yes,18.2) (No,54.5)};
|
|
\end{axis}
|
|
\end{tikzpicture}
|
|
\caption{Use of private online data storage}
|
|
\label{fig:cloud_usage}
|
|
\end{figure}
|
|
|
|
\chapter{A personal view on privacy}
|
|
|
|
\vspace{-5em}
|
|
\begin{center}
|
|
{\tiny This chapter expresses the view of the author of both this documentation and the software associated with it and only of him.}
|
|
\end{center}
|
|
\vspace{5em}
|
|
|
|
\vspace{-3em}\hspace{-1.4em}Privacy is an everyday notion. Every day, people use objects made to guarantee some, from curtains to acoustic insulation, from locked doors to security cabinets. Privacy is something that concerns doctors, lawyers, engineers, inventors, chefs, military staff\ldots{}
|
|
|
|
Sometimes privacy is an indirect concern: an archival company should not take a peek at your doctor's or lawyer's files and cases. Sometimes such an indirect concern becomes someone's concern through friends, business partners, lovers\ldots{} You would not want someone to learn your friend's secrets through you.
|
|
|
|
50 years ago, someone wanting to learn your secrets had to listen in on your telephone line, break into your office and open your safe, go into your house or hire a detective. Today, that person may just be able to buy your secrets.
|
|
|
|
\vspace{1em}In this chapter we will talk about a variety of topics related to digital privacy, from its premises to its implementation.
|
|
|
|
\section{On terms of service}
|
|
|
|
In the software and service industry, terms and conditions of service are the typical way for a company to announce the type of data they collect and the use they make of said data.
|
|
|
|
Those conditions may also tell a person to whom the data sent on their service belongs, some services taking ownership of, for example, all and any picture uploaded to them.
|
|
|
|
This is in my opinion problematic from a moral standpoint when it is not explicit that the service acquires your information with your consent but on terms you may not entirely agree with for the simple reason that those terms are buried into a huge quantity of legal information.
|
|
|
|
The projection of that issue is when the very same terms and conditions allow for the company to sell or provide the information, generally non-anonymized, to a third party without additional demand for consent. This is extremely common in companies that offer services ``for free'' or for very low prices compared to the cost of the actual service.
|
|
|
|
\section{On advertisement}
|
|
|
|
Advertisement is a very close issue to the one above. Most advertisements online run code on the computer that displays the advertisement to ensure the advertisement is seen by a human and not a computer. The advertisement also collects information to uniquely identify the user and link the user to the data in the page and website the user is visiting. This allows the advertisement company to run finely adapted advertisement.
|
|
|
|
One of the bad consequences of that is what we saw during the Cambridge Analytica incident in 2017, when the company of the same name was tasked with influencing voters of the United States of America, targeting them with advertisement and viral videos specifically on points of the opposing party they were likely to disagree with.
|
|
|
|
In that very scandal, it had appeared the developed database could for example accurately point out users that were in favour of free access to guns\autocite{cambridge_analytica}.
|
|
|
|
Those are however not the average practices. For example the DuckDuckGo search engine provider only provides advertisement depending on the exact query entered and nothing more, not collecting any data. On a grayer side, Twitter provides access to tweet statistics and advertises this feature to all users, letting you know how they get their money and offering you to go from user to consumer very easily, which is a better practice than silently collecting data for sales to companies only.
|
|
|
|
\section{On manipulation and political acts}
|
|
|
|
As technology evolved, we gained the power to link multiple pieces of data about users, as presented in the previous part of this chapter; we also showed how data collection can be used to further a political agenda with targeted advertisement. But this omits the clearest and most easily forgotten form of political warfare: collecting the other side's information at the source. This also applies to people that may want to blackmail someone else or just ruin their reputation for hidden motives or just the challenge of it.
|
|
|
|
Numerous times have we seen such breaches. From the Watergate scandal half a century ago to the Apple iCloud breaches in 2014\autocite{bbc_icloud}, stealing data is a typical way of spying on your opponent whatever the game, be it political, gambling, contests, art, etc\ldots{} It can also be used to obtain an unfair advantage in trials and other circumstances.
|
|
|
|
We here see the importance of privacy for public figures just as we saw it for voters in the last part. It goes the same for other methods like viral advertising and scientific cherry-picking when pushing a political agenda.
|
|
|
|
It is also valid on a bigger scale like for example, standardization of encryption by a country in order to enforce it to be breakable like it happened in 1977 with the DES~56 encryption primitive\autocite{wiki:Data_Encryption_Standard}.
|
|
|
|
\section{On misrepresentation of encryption}
|
|
\begin{center}
|
|
\textit{If you want to further your understanding of encryption before proceeding, I advise you to read \autoref{annex:encryption_popularized}.}
|
|
\end{center}
|
|
\vspace{2em}
|
|
|
|
\hspace{-1.4em}Encryption is often misrepresented, both by lots of governmental figures and by lots of commercial software providers.
|
|
|
|
Nowadays, most of the web communications are encrypted and at least partially authenticated. Authentication is done through asymmetrical encryption based systems, contacting an intermediate named a certificate provider. Some of the certificates are included in most mainstream browsers (for example, the certificates of \texttt{google.com} are embedded in Android devices and in Google Chrome), which secures the communication with those entities if the private key is not compromised.
|
|
|
|
This doesn't mean that any data sent to those services is encrypted once stored on the provider: most providers do not store data encrypted as it brings computing costs up by a very significant margin if they need to access that data.
|
|
|
|
Similarly, it is considered bad practice to store passwords in a readable format. To protect them, specific cryptographic techniques exist so that it is possible to verify a password from a form of said password transformed with a one-way transformation named a cryptographic hash function. That said, some companies still store passwords in readable form in their databases.
|
|
|
|
This means that, should access be compromised on the database of a company or within any vulnerable part of their computer system, data could be entirely compromised. This has happened a lot in recent years, and is bound to be a phenomenon that multiplies should companies not start caring for their customers' privacy.
|
|
|
|
Such compromise can happen in various ways, and having physical access to the machine makes it easy to access most if not all data on the machine. Since most companies rent their disk space, servers, or computing power from other companies, you also rely on those companies' contractors not to sell your data per the terms of their hosting services, and encryption of the transit of data will not protect you from this issue.
|
|
|
|
In the meanwhile, all companies play the game of demagogy and present themselves as perfectly secure. Some have the transparency to present you the way their technology works, relying on open-source software to provide their services.
|
|
|
|
\vspace{1em}There is also the topic of back-doors in those services for governmental checks. The main issue with them is the following: if the government can access your data without you knowing, then virtually anyone can do the same. It adds a critical point of failure in the system. It also means the service is not usable for sensitive topics like defense and military uses.
|
|
|
|
On an even more worrisome topic, some companies boast about featuring encryption of user data, while they only ever ensure this encryption in transit, or advertise it while not all of their offers actually feature encryption of user data.
|
|
|
|
\chapter{Izaro storage}
|
|
|
|
The way we decided to implement our storage focuses on protection, obfuscation and performance. We will here explain the influences and consequences of these ideas.
|
|
|
|
\section{Goals}
|
|
|
|
We want to provide an online storage with the following properties:
|
|
|
|
\vspace{0.6em}First of all, it must be georeplicated. It is not okay to lose service access due to the loss of one server farm on our own side.
|
|
|
|
\vspace{0.3em}Then, the data must be protected, we ourselves should be entirely unable to read it, we should also be unable to read the metadata.
|
|
|
|
\vspace{0.3em}Also, any part of the data must be fast enough to access that it is hard to differentiate our service from access of a hard-drive given a good enough network connection, same goes for writing.
|
|
|
|
\vspace{0.3em}Finally, it must be flexible and adaptable to multiple use-cases.
|
|
|
|
\vspace{1em}This leads us to the following idea: we are aiming to create a service that can store encrypted data, it must be able to store it in a layout similar to a disk, this way it possesses the same capabilities as a hard drive disk. The key to decrypt the data is stored online but encrypted with the user password. Authentication requires the user to be able to read the password to get a token. It is possible to leave said token disabled and enable it only with a second authentication factor.
|
|
|
|
We want our system to be protected from the point of view of our customers, as such, we aim at it having a code-base readable and short enough to be explored completely in 3 days by a developer with access to enough documentation.
|
|
|
|
\section{Principles}
|
|
|
|
Our project aims to follow the following principles:
|
|
|
|
\begin{itemize}
|
|
\item Principle of least knowledge: if it is possible for us to never have access to a readable form of some data, then we should not make it mandatory or provide alternatives.
|
|
\item Principle of greater usage: if it is possible, we have to use the most out of the algorithms we use, be it cryptographic primitives or other algorithms.
|
|
\item Principle of openness: we aim to disclose any incident that may happen, and to disclose any request by officials to access anyone's data.
|
|
\end{itemize}
|
|
|
|
The principle of least knowledge is upheld in the very design of the system: only the user can make sense of the address space of both the file system and block device. To provide an analogy, the data is stored in multiple boxes. The user-side software randomly labels the boxes and seals them (that seal is the encryption). If you store data that overflows from one box, it will be stored in multiple boxes. Decrypting any data requires knowing which box is the first one and which is the next one, but that very piece of information is not stored on the server: it is stored in one of the sealed boxes.
|
|
|
|
Furthermore, the labels of each block of data can be used as a piece of the encryption process, this is an example of the principle of greater usage: any additional information that can help make the system safer, we will use it.
|
|
|
|
As for the openness principle, it is just as stated: we will disclose any demands that are made as soon as they are made, as well as our responses to them. We will disclose any security issue or concern we receive. We will provide tools for anyone to be informed of this information through multiple channels.
|
|
|
|
\section{Data life cycle}
|
|
|
|
Here is a list of the data that may be collected by us in any interaction with our software. This data is sorted by interaction.
|
|
|
|
\begin{table}[h]
|
|
\centering
|
|
\begin{tabular}{lll}
|
|
\hline
|
|
\ttfamily User action & \ttfamily Data collected & \ttfamily Reason \\\hhline{===}
|
|
\multirow{4}{*}{Account creation} & \multicolumn{1}{l}{Email} & \multicolumn{1}{l}{Authentication} \\\cline{2-3}
|
|
& \multicolumn{1}{l}{Nickname} & \multicolumn{1}{l}{Authentication} \\\cline{2-3}
|
|
& \multicolumn{1}{l}{Password (Obfuscated)} & \multicolumn{1}{l}{Authentication} \\\cline{2-3}
|
|
& \multicolumn{1}{l}{Time of creation} & \multicolumn{1}{l}{Bookkeeping} \\\hline
|
|
\multirow{1}{*}{Connection} & \multicolumn{1}{l}{Connection time} & \multicolumn{1}{l}{Authentication} \\\hline
|
|
\multirow{3}{*}{Payment (below 20\euro)} & \multicolumn{1}{l}{Amount} & \multicolumn{1}{l}{Accounting} \\\cline{2-3}
|
|
& \multicolumn{1}{l}{Time of payment} & \multicolumn{1}{l}{Accounting} \\\cline{2-3}
|
|
& \multicolumn{1}{l}{Type of payment} & \multicolumn{1}{l}{Accounting} \\\hline
|
|
\multirow{4}{*}{Payment (above 20\euro)} & \multicolumn{1}{l}{Amount} & \multicolumn{1}{l}{Accounting} \\\cline{2-3}
|
|
& \multicolumn{1}{l}{Time of payment} & \multicolumn{1}{l}{Accounting/Bookkeeping} \\\cline{2-3}
|
|
& \multicolumn{1}{l}{Type of payment} & \multicolumn{1}{l}{Accounting} \\\cline{2-3}
|
|
& \multicolumn{1}{l}{Invoice address} & \multicolumn{1}{l}{Accounting} \\\hline
|
|
\multirow{2}{*}{Writing data} & \multicolumn{1}{l}{Server time} & \multicolumn{1}{l}{Data protection (Consensus system)} \\\cline{2-3}
|
|
& \multicolumn{1}{l}{Number of used blocks} & \multicolumn{1}{l}{Accounting/Bookkeeping} \\\hline
|
|
|
|
\end{tabular}
|
|
\caption{Table of collected data}
|
|
\label{tab:data_collection}
|
|
\end{table}
|
|
|
|
|
|
\chapter{A personal view on business practices}
|
|
\section{On selling the user}
|
|
\section{On misrepresenting the invisible}
|
|
|
|
\autocite{apple_sweatshop}
|
|
|
|
\section{On practice of transparent business}
|
|
|
|
\part{Functional specification}
|
|
|
|
\chapter{Client capabilities}
|
|
\section{Key header}
|
|
|
|
\begin{itemize}[label={\Square}]
|
|
\item Can be version-controlled
|
|
\item Can store a pointer to a root
|
|
\item Does not permit retrieval of the key alone
|
|
\end{itemize}
|
|
|
|
\section{System root header}
|
|
|
|
\begin{itemize}[label={\Square}]
|
|
\item Is extendable
|
|
\item Can store a pointer to a device and its type
|
|
\item Can store an attribute pointer for a device
|
|
\end{itemize}
|
|
|
|
\section{Argus block device}
|
|
|
|
\begin{itemize}[label={\Square}]
|
|
\item Requires enough RAM space to store its metadata for access
|
|
\item Reading data is $O(1)$
|
|
\item Uses record $x$ and $y$ as $iv$ and/or $nonce$
|
|
\item Uses stream position as position
|
|
\item Can list multiple keys and ciphers in its header, applied in sequential order
|
|
\item Supports transactional access
|
|
\end{itemize}
|
|
|
|
\section{Izaro file system}
|
|
|
|
Let $n$ be the number of files in the file system. Let $m$ be the size of a given file.
|
|
|
|
\begin{itemize}[label={\Square}]
|
|
\item All directory changes are transactional and atomic
|
|
\item Opening a file is $O(\log n)$ or better
|
|
\item Reading a block from an open file is $O(\log m)$ or better
|
|
\item Writing a block to an open file is $O(\log m)$ or better
|
|
\item Changing file attributes is atomic
|
|
\item Uses record $x$, $y$, and UUID as $iv$ and/or $nonce$
|
|
\item Uses file position as position inside files
|
|
\item Uses position in block as position inside non-file elements
|
|
\item[] On clients implementing a POSIX interface
|
|
\begin{itemize}[label={\Square}]
|
|
\item Append is atomic if possible
|
|
\item Advisory locking is implemented on a file by file basis
|
|
\end{itemize}
|
|
\item POSIX permissions are implemented
|
|
\begin{itemize}[label={\Square}]
|
|
\item An association file only modifiable by user 0 (root) maps UIDs to user names
|
|
\item Permissions are handled by the client only
|
|
\item User 0 (root) can alter any permission
|
|
\end{itemize}
|
|
\item NT permissions may not be implemented
|
|
\begin{itemize}[label={\Square}]
|
|
\item NT clients may not perform ACL checks
|
|
\item Any NT user can alter file data
|
|
\item NT users can not alter file permissions
|
|
\item Items created by NT users are created with the permission of their parent item
|
|
\end{itemize}
|
|
\end{itemize}
|
|
|
|
\chapter{Front-end capabilities}
|
|
\section{Web interface}
|
|
\begin{itemize}[label={}]
|
|
\item Account management
|
|
\begin{itemize}[label={\Square}]
|
|
\item Can create an account
|
|
\item Can share a 2FA secret
|
|
\item Can delete an account
|
|
\item Can confirm a payment
|
|
\item Can inform of payment lateness
|
|
\item Can self-wipe account
|
|
\end{itemize}
|
|
|
|
\item Authentication
|
|
\begin{itemize}[label={\Square}]
|
|
\item Can generate a token from a connection request
|
|
\item Can confirm a token with a 2FA subtoken
|
|
\item Can invalidate a token
|
|
\item Can generate a shared secret to handle UDP connections
|
|
\item Can regenerate a password block from an older password block
|
|
\end{itemize}
|
|
|
|
\item Provides a payment link
|
|
\item Provides a link to this document
|
|
\item Provides all personal information stored on our side for the logged in user
|
|
\item Provides a link to our company balance
|
|
\end{itemize}
|
|
|
|
\section{Heavy-clients}
|
|
|
|
\begin{itemize}[label={\Square\Square}]
|
|
\item[{TD}]
|
|
\item Can obtain a shared secret from the data interface
|
|
\item Can obtain a token from a connection
|
|
\item Can obtain a token from user input
|
|
\item Can synchronize a supported filesystem
|
|
\item Can display usage statistics
|
|
\item Can detect a block device and mount it
|
|
\item Can mount IzaroFS using FUSE on systems that support it
|
|
\item Can mount IzaroFS in slow mode on other systems
|
|
\end{itemize}
|
|
|
|
\subsection{Command-line interface client}
|
|
|
|
\begin{itemize}[label={\Square}]
|
|
\item Can be called with automated tools
|
|
\item Supports callback scripts
|
|
\end{itemize}
|
|
|
|
\subsubsection{General options}
|
|
|
|
\textit{Any option marked with a * must be order independent provided it is following the command that defines it.}
|
|
|
|
\begin{itemize}[label={\Square}]
|
|
\item \texttt{----help}*: provides a list of commands and of all the general options, if used with a command, display the help page of that command.
|
|
\item \texttt{----verbose}*: trips a flag that makes every operation output its information on \texttt{stderr}
|
|
\item \texttt{----out-info}*: trips a flag that makes every operation output more information in a computer readable format on \texttt{stdout}
|
|
\end{itemize}
|
|
|
|
\subsubsection{Commands}
|
|
\begin{itemize}[label={\Square}]
|
|
\item \texttt{help}: same as the similar option.
|
|
\item \texttt{list}: enter into list mode for the next command, if no further arguments, display a list of all accessible roots and their types if they are known.
|
|
\begin{itemize}[label={\Square}]
|
|
\item \texttt{help}: provide help on the subcommands
|
|
\item \texttt{mountable}: Display all accessible roots that are mountable as file systems.
|
|
\item \texttt{in-use}: Display roots that are currently in use.
|
|
\item \texttt{connections}: Display all registered connections in a \texttt{csv} format with headers as in \autoref{fig:connection_csv_columns}.
|
|
\item \texttt{endpoints}: Display all registered endpoints in a \texttt{csv} format with headers as in \autoref{fig:endpoints_csv_columns}.
|
|
\end{itemize}
|
|
\item \texttt{mount}: Mount a mountable file system, the identifier of the file system must be provided as next argument. Outputs \texttt{OK} on success and \texttt{KO} on failure.
|
|
\begin{itemize}[label={\Square}]
|
|
\item \texttt{help}: provide help on the subcommands
|
|
\item \texttt{----read-only}*: set up the read only flag of the file system
|
|
\end{itemize}
|
|
\item \texttt{login}: performs an interactive login sequence, no information is stored on disk.
|
|
\begin{itemize}[label={\Square}]
|
|
\item \texttt{help}: provide help on the subcommands
|
|
\item \texttt{----persistent}*: trips the system into persistent login, with the key stored on the system in a configuration file. If the \texttt{----unsafe} flag is not activated, does nothing.
|
|
\item \texttt{----unsafe}*: allows unsafe behaviours.
|
|
\end{itemize}
|
|
\item \texttt{logout}: performs a logout of the selected user, will also remove persistent login information. Outputs \texttt{OK} on success and \texttt{KO} on failure.
|
|
\item \texttt{umount}: performs the unmounting of the provided filesystem. Outputs \texttt{OK} on success and \texttt{KO} on failure.
|
|
\begin{itemize}[label={\Square}]
|
|
\item \texttt{help}: provide help on the subcommands
|
|
\item \texttt{----NOW}*: forces the unmounting to happen even if unterminated operations are pending.
|
|
\end{itemize}
|
|
\item \texttt{enable}: enables a storage registered as block device storage. Outputs \texttt{OK} on success and \texttt{KO} on failure.
|
|
\begin{itemize}[label={\Square}]
|
|
\item \texttt{help}: provide help on the subcommands
|
|
\item \texttt{----read-only}*: set up the read only flag of the device and prohibits its use for write operations.
|
|
\end{itemize}
|
|
\item \texttt{disable}: disables a storage registered as block device storage. Outputs \texttt{OK} on success and \texttt{KO} on failure.
|
|
\begin{itemize}[label={\Square}]
|
|
\item \texttt{help}: provide help on the subcommands
|
|
\item \texttt{----NOW}*: stops the device immediately, ignoring any currently executing operations.
|
|
\end{itemize}
|
|
\end{itemize}
|
|
|
|
Computer readable output format should be a subset of JSON, hence readable by any JSON parser, with the values shown in \autoref{fig:computer_readable_output}.
|
|
|
|
\begin{figure}[h]
|
|
\begin{center}
|
|
\begin{minipage}[c]{0.9\textwidth}
|
|
\begin{itemize}
|
|
\item \texttt{status}: \texttt{"OK"} if the command is a success, \texttt{"KO"} if the command failed.
|
|
\item \texttt{error\_{}code}: an integer representing the error as a hash of the line of code that failed.
|
|
\end{itemize}
|
|
\end{minipage}
|
|
\end{center}
|
|
\caption{Computer readable output contents}
|
|
\label{fig:computer_readable_output}
|
|
\end{figure}
|
|
|
|
Multiple types of data are provided as CSV with the separator ``\texttt{|}'' and the first line always being a header.
|
|
|
|
\begin{figure}[h]
|
|
\begin{center}
|
|
\begin{minipage}[c]{0.9\textwidth}
|
|
\begin{itemize}
|
|
\item \texttt{user}: the email address of the user
|
|
\item \texttt{token}: the token as a hexadecimal string
|
|
\item \texttt{"connected since"}: Unix timestamp in seconds since the connection got opened.
|
|
\item \texttt{"server time offset"}: The estimated offset of time between the server and client in microseconds.
|
|
\item \texttt{persistent}: \texttt{true} if the connection is stored on disk, \texttt{false} if it is stored only in memory.
|
|
\end{itemize}
|
|
\end{minipage}
|
|
\end{center}
|
|
\caption{Connection information}
|
|
\label{fig:connection_csv_columns}
|
|
\end{figure}
|
|
|
|
\begin{figure}[h]
|
|
\begin{center}
|
|
\begin{minipage}[c]{0.9\textwidth}
|
|
\begin{itemize}
|
|
\item \texttt{user}: the email address of the user that owns the connection used
|
|
\item \texttt{"unix user"}: UID of the user that mounted the endpoint
|
|
\item \texttt{"endpoint file"}: the file where the endpoint is mounted if applicable, may be a device on NT systems
|
|
\item \texttt{"root id"}: the identifier of the root
|
|
\end{itemize}
|
|
\end{minipage}
|
|
\end{center}
|
|
\caption{Active endpoint data}
|
|
\label{fig:endpoints_csv_columns}
|
|
\end{figure}
|
|
|
|
\subsection{Desktop client}
|
|
|
|
\textit{Only set to support the core subset of features for heavy-clients.}
|
|
|
|
The desktop application is expected to work on at least Windows and Linux entirely, and to also work on any system with FUSE for the IzaroFS part.
|
|
|
|
Any cosmetic shaders should for that reason be made with OpenGL, and at that with a very compatible version. Development of those shaders will be left to some professional specially trained in making shaders.
|
|
|
|
Those cosmetic items must have a textual fallback accessible by hovering their element. If they represent performance statistics, any visual may be clickable and print the statistics in a file.
|
|
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=0.9\columnwidth]{Pictures/desktopmain}
|
|
\caption{Desktop application main view mockup}
|
|
\label{fig:mockup-desktopmain}
|
|
\end{figure}
|
|
|
|
On \autoref{fig:mockup-desktopmain}, the circle and pie will actually be replaced by a render graphic whose inner part will spin if and only if time synchronization is happening normally; the outer part will grow brighter if elements of progress are made on the selected file system, and the inner part will transition between green and red through yellow, showing data congestion.
|
|
|
|
Other render elements could be added to the application for aesthetic purposes, at the sole condition they provide a visually appealing way to understand the status of the system.
|
|
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=0.9\columnwidth]{Pictures/loginpage}
|
|
\caption{Desktop application login and register page mockup}
|
|
\label{fig:mockup-loginpage}
|
|
\end{figure}
|
|
|
|
Behaviour of the login part of the page is expected to be identical to the behaviour of the login command.
|
|
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[height=0.4\textheight,width=0.9\columnwidth,keepaspectratio]{Pictures/confirmregister}
|
|
\caption{Desktop application registration confirmation page mockup}
|
|
\label{fig:mockup-confirmregister}
|
|
\end{figure}
|
|
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[height=0.4\textheight,width=0.9\columnwidth,keepaspectratio]{Pictures/newdevice}
|
|
\caption{Desktop application block device creation page mockup}
|
|
\label{fig:mockup-newdevice}
|
|
\end{figure}
|
|
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[height=0.4\textheight,width=0.9\columnwidth,keepaspectratio]{Pictures/newfs}
|
|
\caption{Desktop application Izaro filesystem creation page mockup}
|
|
\label{fig:mockup-newfs}
|
|
\end{figure}
|
|
|
|
\section{Data interface}
|
|
\begin{itemize}[label={\Square}]
|
|
\item[] Supports actions
|
|
\begin{itemize}[label={\Square}]
|
|
\item Read
|
|
\item Write
|
|
\item Update
|
|
\item Allocate
|
|
\item Allocate and push
|
|
\item Read and push
|
|
\item Pop and write
|
|
\item Delete
|
|
\item ``Delete'' and push
|
|
\item Assert timestamp
|
|
\item Commit
|
|
\end{itemize}
|
|
\item Can execute a chain of actions
|
|
\item Can confirm a chain of actions
|
|
\item Can clear unconfirmed actions
|
|
\item Can execute a single action and confirm it
|
|
\item Can read usage statistics
|
|
\item[] Shared systems
|
|
\begin{itemize}[label={\Square}]
|
|
\item Can lock and unlock a named mutex
|
|
\item Can compare and swap a semaphore
|
|
\end{itemize}
|
|
\item[] Administration
|
|
\begin{itemize}[label={\Square}]
|
|
\item Can read global statistics
|
|
\item Can read list of UUIDs
|
|
\item Can list users
|
|
\item Can list connections
|
|
\item Can wipe account
|
|
\item Can register a backend
|
|
\end{itemize}
|
|
\item[] Automation
|
|
\begin{itemize}[label={\Square}]
|
|
\item Can replace a dead server
|
|
\item Will duplicate if spares available, preferring $A \rightarrow B \rightarrow AB$
|
|
\end{itemize}
|
|
\end{itemize}
|
|
|
|
\chapter{Back-end capabilities}
|
|
\section{GoJ Database}
|
|
\begin{itemize}[label={\Square}]
|
|
\item[] Data
|
|
\begin{itemize}[label={\Square}]
|
|
\item Can read a record
|
|
\item Can write a record
|
|
\item Can confirm a record
|
|
\item Can remove a record
|
|
\item Can test existence of a record
|
|
\item Can allocate a record
|
|
\item Can read a record's metadata
|
|
\end{itemize}
|
|
\item[] Stats
|
|
\begin{itemize}[label={\Square}]
|
|
\item Can provide stats given the stats format
|
|
\item Can alert in case of suspicious stats
|
|
\end{itemize}
|
|
\item Can stream a list of all records, then stream all transformations that happened since the streaming started
|
|
\end{itemize}
|
|
|
|
\section{Performance requirements}
|
|
\begin{itemize}[label={\Square}]
|
|
\item Less than 20\% below the server disk performance in terms of latency
|
|
\item Assumes available RAM to be at least 0.4\% of the disk capacity
|
|
\end{itemize}
|
|
|
|
\part{Technical specification}
|
|
|
|
\chapter{Storage layer}
|
|
\section{\texttt{izaro-storage}}
|
|
\section{\texttt{db\_{}stats}}
|
|
|
|
\chapter{Coordination layer}
|
|
\section{Client to \texttt{izaro-coordinate}}
|
|
\section{\texttt{izaro-coordinate} to \texttt{izaro-storage}}
|
|
|
|
\chapter{Time synchronization}
|
|
\section{Steadiness requirement}
|
|
\section{Storage side requirement}
|
|
|
|
\chapter{Client side}
|
|
\section{Key blocks and system root layout}
|
|
\section{Command line user interface}
|
|
\section{Graphical user interface}
|
|
|
|
\chapter{Block storage system}
|
|
|
|
\chapter{Native file system}
|
|
|
|
\appendix
|
|
|
|
\part{Annexes}
|
|
%\setcounter{chapter}{1}
|
|
%\renewcommand\thechapter{\Alph{chapter}}
|
|
|
|
\chapter{Encryption popularized}
|
|
\label{annex:encryption_popularized}
|
|
|
|
\textbf{Encryption: }The act of transforming a message into a random looking cipher-text. The original message is often named the plain-text.
|
|
|
|
\hspace{-1.4em}\textbf{Cipher: }The mathematical function that transforms a plain-text into a cipher-text
|
|
|
|
\vspace{1.5em}\hspace{-1.4em}Encrypting data can be done in various ways. Each way has its own properties and its resistances to certain types of attacks. All modern cryptography is key-based cryptography. It means that the way we encrypt data is not secret; what is secret is a value, named the key, that is used to encrypt the data.
|
|
|
|
\section{Properties of encryption}
|
|
|
|
We will here explain some of the properties a cipher can hold.
|
|
|
|
\subsection{Resistance}
|
|
|
|
A cipher generally has its resistance expressed as a power of two (e.g.: $2^{103}$) or as a number of bits of entropy (e.g.: $103$~bits). It is to be noted that this scale is not linear: it is exponential.
|
|
|
|
This means that a cipher that has $104$~bits of entropy is 2 times harder to break than one of the same family with a resistance of $103$~bits. Comparing resistance between different families is not relevant.
|
|
|
|
\subsection{Compactness}
|
|
|
|
Compactness of a cipher means that if you encrypt a message of size $n$ you will obtain a cipher-text of the same size. Conversely, if a cipher can generate a cipher-text longer than its message, it is said to be not compact.
|
|
|
|
For example, let's consider a simple cipher: for a message $A$, read it as a number and multiply it by a value that will be the key.
|
|
|
|
If your message is, for example, 8 digits long, like $00005555$, and the key is $12345678$, the cipher-text will be equal to $5555 \times 12345678 = 68580241290$, which makes an 11-digit cipher-text from an 8-digit message.
|
|
|
|
\subsection{Homomorphism}
|
|
|
|
Homomorphism means that for a message $A$ and an operation $f$ (for example, $f : x \rightarrow x \times 2$ is the operation of multiplying by 2), if you apply a cipher to $A$ and get a cipher-text $B$, there exists a way to apply $f$ to $B$ such that decrypting the result gives you the result of applying $f$ to $A$.
|
|
|
|
Expressed more simply, it means that you can execute operations on encrypted data without needing to decipher it or understand it. Very few encryption mechanisms are fully homomorphic and those are mostly in research\autocite{Gentry:2009:FHE:1834954}.
|
|
|
|
\section{Types of encryption}
|
|
|
|
Encryption comes in different forms depending on how it handles the cryptographic key. Some ciphers have only one key, which must be known for both encrypting and decrypting the data; we call those symmetrical ciphers. Some have two keys, one for encrypting and one for decrypting; we call those asymmetrical ciphers.
|
|
|
|
\subsection{Symmetrical encryption}
|
|
|
|
\begin{figure}[h]
|
|
\begin{center}
|
|
\begin{itemize}
|
|
\item AES
|
|
\item Chacha20
|
|
\item Blowfish
|
|
\item Serpent
|
|
\item Twofish
|
|
\item CAST5
|
|
\item RC4
|
|
\item DES
|
|
\item 3DES
|
|
\item Skipjack
|
|
\item IDEA
|
|
\end{itemize}
|
|
\end{center}
|
|
\caption{List of symmetrical ciphers}
|
|
\label{fig:sym_ciphers}
|
|
\end{figure}
|
|
|
|
\subsection{Asymmetrical encryption}
|
|
|
|
\begin{figure}[h]
|
|
\begin{center}
|
|
\begin{itemize}
|
|
\item Prime numbers based (RSA)
|
|
\item Elliptic curve based (ECDSA)
|
|
\item Paillier crypto system
|
|
\item Lattice based (NTRU, BLISS\autocite{Gentry:2009:FHE:1834954})
|
|
\end{itemize}
|
|
\end{center}
|
|
\caption{List of asymmetrical ciphers}
|
|
\label{fig:asym_ciphers}
|
|
\end{figure}
|
|
|
|
\chapter{Protocols}
|
|
|
|
\section{\texttt{izaro-storage} queries}
|
|
|
|
\begin{figure}[h!]
|
|
\centering
|
|
\begin{bytefield}[bitwidth=1.06em, bitformatting={\tiny\ttfamily}]{32}
|
|
|
|
\bitheader{0-31} \\
|
|
\bitbox{14}{\texttt{unused}}
|
|
\bitbox{1}{\texttt{\footnotesize 2S}}
|
|
\bitbox{1}{\texttt{\footnotesize B}}
|
|
\bitbox{16}{\texttt{operation code (big endian)}} \\
|
|
\wordbox{2}{\texttt{request identifier}} \\
|
|
\wordbox{1}{\textit{optional\ldots{}} \texttt{continuation}}
|
|
\end{bytefield}
|
|
\begin{itemize}
|
|
\item[] \texttt{2S}: the operation is two-stepped (requires a confirmation to be applied) if set
|
|
\item[] \texttt{B}: the operation is a bulk operation if set
|
|
\end{itemize}
|
|
\caption{Common request format (storage)}
|
|
\label{fig:common_format_storage}
|
|
\end{figure}
|
|
|
|
\begin{figure}
|
|
\centering
|
|
\begin{bytefield}[bitwidth=1.06em]{32}
|
|
|
|
\bitheader{0-31} \\
|
|
\wordbox{1}{$x$ (Big endian integer)} \\
|
|
\wordbox{1}{$y$ (Big endian integer)} \\
|
|
\wordbox{4}{\texttt{UUID}}
|
|
\end{bytefield}
|
|
\begin{itemize}
|
|
\item[] a zeroed UUID means the record is invalid
|
|
\end{itemize}
|
|
\caption{Record identifier format}
|
|
\label{fig:record_identifier}
|
|
\end{figure}
|
|
|
|
\begin{figure}
|
|
\centering
|
|
\begin{bytefield}[bitwidth=0.48em, bitformatting={\tiny\ttfamily}]{64}
|
|
|
|
\bitheader{0,31,63} \\
|
|
\wordbox{2}{\texttt{Common request format (storage)}} \\
|
|
\wordbox{3}{\texttt{Record identifier}} \\
|
|
\wordbox{1}{\textit{optional\ldots{}} \texttt{Timestamp (Big endian integer)}} \\
|
|
\end{bytefield}
|
|
\begin{itemize}
|
|
\item[] Timestamp of 0 or absent means latest valid value
|
|
\end{itemize}
|
|
\caption{Request format for read (storage)}
|
|
\label{fig:read_format_storage}
|
|
\end{figure}
|
|
|
|
\begin{figure}
|
|
\centering
|
|
\begin{bytefield}[bitwidth=0.46em, bitformatting={\tiny\ttfamily}]{64}
|
|
|
|
\bitheader{0,31,63} \\
|
|
\wordbox{2}{\texttt{Common request format (storage)}} \\
|
|
\wordbox{3}{\texttt{Record identifier}} \\
|
|
|
|
\begin{rightwordgroup}{$16Kio$}
|
|
\wordbox[lrt]{1}{\texttt{Database page}} \\
|
|
\skippedwords \\\wordbox[lrb]{1}{}
|
|
\end{rightwordgroup} \\
|
|
\wordbox{1}{\textit{optional\ldots{}} \texttt{Timestamp (Big endian integer)}}
|
|
\end{bytefield}
|
|
\begin{itemize}
|
|
\item[] Timestamp of 0, absent, or maxed as of {\footnotesize\texttt{std::numeric\_limits<uint64\_t>::max()}} means server time should be used
|
|
\end{itemize}
|
|
\caption{Request format for write (storage)}
|
|
\label{fig:write_format_storage}
|
|
\end{figure}
|
|
|
|
\begin{figure}
|
|
\centering
|
|
\begin{bytefield}[bitwidth=0.48em, bitformatting={\tiny\ttfamily}]{64}
|
|
|
|
\bitheader{0,31,63} \\
|
|
\wordbox{2}{\texttt{Common request format (storage)}} \\
|
|
\wordbox{3}{\texttt{Record identifier}} \\
|
|
\wordbox{1}{\texttt{Timestamp (Big endian integer)}} \\
|
|
\end{bytefield}
|
|
\begin{itemize}
|
|
\item[] An invalid timestamp does nothing
|
|
\end{itemize}
|
|
\caption{Request format for confirm (storage)}
|
|
\label{fig:confirm_format_storage}
|
|
\end{figure}
|
|
|
|
\begin{figure}
|
|
\centering
|
|
\begin{bytefield}[bitwidth=0.48em, bitformatting={\tiny\ttfamily}]{64}
|
|
|
|
\bitheader{0,31,63} \\
|
|
\wordbox{2}{\texttt{Common request format (storage)}} \\
|
|
\wordbox{3}{\texttt{Record identifier}} \\
|
|
\wordbox{1}{\texttt{Timestamp (Big endian integer)}} \\
|
|
\end{bytefield}
|
|
\begin{itemize}
|
|
\item[] An invalid timestamp does nothing, a zeroed timestamp or absent timestamp means remove all, a specific value removes the target value.
|
|
\end{itemize}
|
|
\caption{Request format for remove (storage)}
|
|
\label{fig:remove_format_storage}
|
|
\end{figure}
|
|
|
|
|
|
|
|
\begin{figure}
|
|
\centering
|
|
\begin{bytefield}[bitwidth=1.06em, bitformatting={\tiny\ttfamily}]{32}
|
|
|
|
\bitheader{0,31} \\
|
|
\wordbox{4}{\texttt{Common request format (storage)}} \\
|
|
\bitbox{32}{\texttt{Size}} \\
|
|
\begin{rightwordgroup}{Repeats \\ \texttt{Size} times}
|
|
\wordbox{6}{\texttt{Record identifier}} \\
|
|
\wordbox{2}{\texttt{Timestamp (Big endian integer)}}
|
|
\end{rightwordgroup} \\
|
|
\end{bytefield}
|
|
\begin{itemize}
|
|
\item[] Timestamps are to be interpreted as in the read operation, with the exception they cannot be omitted
|
|
\end{itemize}
|
|
\caption{Request format for bulk read (storage)}
|
|
\label{fig:bulk_read_format_storage}
|
|
\end{figure}
|
|
|
|
\section{\texttt{izaro-storage} replies}
|
|
|
|
\begin{figure}[h!]
|
|
\centering
|
|
\begin{bytefield}[bitwidth=0.48em, bitformatting={\tiny\ttfamily}]{64}
|
|
|
|
\bitheader{0,31,63} \\
|
|
\wordbox{3}{\texttt{Record identifier}} \\
|
|
\bitbox{64}{\texttt{Timestamp}} \\
|
|
\bitbox{64}{\texttt{Offset}} \\
|
|
\bitbox{30}{\texttt{Unused}}
|
|
\bitbox{1}{\texttt{\tiny R}}
|
|
\bitbox{1}{\texttt{\tiny C}}
|
|
\end{bytefield}
|
|
\begin{itemize}
|
|
\item[] \texttt{R}: is set if the record is removed (not eligible for cleanup)
|
|
\item[] \texttt{C}: is set if the record is confirmed
|
|
\end{itemize}
|
|
\caption{Record format (storage)}
|
|
\label{fig:record_storage}
|
|
\end{figure}
|
|
|
|
\begin{figure}[h]
|
|
\centering
|
|
\begin{bytefield}[bitwidth=0.48em, bitformatting={\tiny\ttfamily}]{64}
|
|
|
|
\bitheader{0,31,63} \\
|
|
\bitbox{64}{\texttt{request identifier}} \\
|
|
\wordbox[lrt]{5}{\texttt{record information}} \\
|
|
|
|
\begin{rightwordgroup}{$16Kio$}
|
|
\bitbox[lrb]{32}{}
|
|
\bitbox[lrt]{32}{} \\
|
|
\wordbox[lr]{1}{\vspace{0.96em}\texttt{Database page}} \\
|
|
\skippedwords \\\wordbox[lrb]{1}{}
|
|
\end{rightwordgroup} \\
|
|
\end{bytefield}
|
|
\begin{itemize}
|
|
\item[-] the request identifier must be equal to the request identifier from the query
|
|
\item[-] in case of a write request or confirm request, the database page may be omitted or contain invalid information
|
|
\end{itemize}
|
|
\caption{Common reply format (storage)}
|
|
\label{fig:common_format_reply_storage}
|
|
\end{figure}
|
|
|
|
\begin{figure}[h!]
|
|
\centering
|
|
\begin{bytefield}[bitwidth=0.48em, bitformatting={\tiny\ttfamily}]{64}
|
|
|
|
\bitheader{0,31,63} \\
|
|
\bitbox{64}{\texttt{request identifier}} \\
|
|
\begin{rightwordgroup}{$48o$}
|
|
\bitbox[lrt]{64}{} \\
|
|
\wordbox[lr]{1}{\vspace{0.96em}\texttt{unused}} \\
|
|
\skippedwords \\\wordbox[lrb]{1}{}
|
|
\end{rightwordgroup} \\
|
|
\bitbox{64}{\texttt{number of unused pages}} \\
|
|
\bitbox{64}{\texttt{number of free pages due to deletions}} \\
|
|
\bitbox{64}{\texttt{number of pages}} \\
|
|
\bitbox{64}{\texttt{total size of the record table}} \\
|
|
\bitbox{64}{\texttt{total size of the delete table}} \\
|
|
\bitbox{64}{\texttt{unreclaimable pages}} \\
|
|
\bitbox{64}{\texttt{available records in the table}} \\
|
|
\bitbox{64}{\texttt{configured synchronization rate}} \\
|
|
\bitbox{64}{\texttt{duration of the last synchronization}} \\
|
|
\bitbox{64}{\texttt{longest duration of synchronization}} \\
|
|
\bitbox{64}{\texttt{average duration of synchronizations}} \\
|
|
\end{bytefield}
|
|
\begin{itemize}
|
|
\item[-] all values are big endian
|
|
\item[-] all times are in \si{\micro\second}
|
|
\end{itemize}
|
|
\caption{Stats reply format (storage)}
|
|
\label{fig:stats_format_reply_storage}
|
|
\end{figure}
|
|
|
|
\section{\texttt{izaro-coordinate} queries}
|
|
|
|
\begin{figure}[h]
|
|
\centering
|
|
\begin{bytefield}[bitwidth=0.48em]{64}
|
|
|
|
\bitheader{0,31,63} \\
|
|
\wordbox{1}{\texttt{operation}} \\
|
|
\wordbox{2}{\texttt{UUID}} \\
|
|
\wordbox{2}{\texttt{token}} \\
|
|
\begin{rightwordgroup}{nonce}
|
|
\wordbox{1}{\texttt{request identifier}} \\
|
|
\wordbox{1}{\texttt{}}
|
|
\end{rightwordgroup} \\
|
|
\bitbox[lrt]{64}{} \\
|
|
\wordbox[lr]{1}{\vspace{0.96em}\texttt{payload}} \\
|
|
\skippedwords \\\wordbox[lrb]{1}{}
|
|
\end{bytefield}
|
|
\begin{itemize}
|
|
\item[] a zeroed UUID means administration packet
|
|
\end{itemize}
|
|
\caption{Data packet}
|
|
\label{fig:data_packet}
|
|
\end{figure}
|
|
|
|
\subsection{User payloads}
|
|
|
|
\begin{figure}[h]
|
|
\centering
|
|
\begin{bytefield}[bitwidth=0.48em]{64}
|
|
|
|
\bitheader{0,31,63} \\
|
|
\wordbox{3}{\texttt{record identifier}} \\
|
|
\wordbox{1}{\texttt{timestamp}} \\
|
|
\end{bytefield}
|
|
\begin{itemize}
|
|
\item[] the data fetched is the last before the timestamp
|
|
\item[] the timestamp is aligned on the coordinator's timestamping
|
|
\item[] if the timestamp is omitted the last confirmed page is read
|
|
\end{itemize}
|
|
\caption{Read request}
|
|
\label{fig:read_request}
|
|
\end{figure}
|
|
|
|
\begin{figure}[h]
|
|
\centering
|
|
\begin{bytefield}[bitwidth=0.48em]{64}
|
|
|
|
\bitheader{0,31,63} \\
|
|
\begin{rightwordgroup}{$32Kio$}
|
|
\bitbox[lrt]{64}{} \\
|
|
\wordbox[lr]{1}{\vspace{0.96em}\texttt{file page}} \\
|
|
\skippedwords \\\wordbox[lrb]{1}{}
|
|
\end{rightwordgroup} \\
|
|
\end{bytefield}
|
|
\begin{itemize}
|
|
\item[] the data will be stored on the time of the server
|
|
\end{itemize}
|
|
\caption{Allocate$+$Write request}
|
|
\label{fig:allocate_write_request}
|
|
\end{figure}
|
|
|
|
\subsection{Root user payloads}
|
|
|
|
\section{\texttt{izaro-coordinate} replies}
|
|
|
|
\begin{figure}[h]
|
|
\centering
|
|
\begin{bytefield}[bitwidth=0.48em]{64}
|
|
|
|
\bitheader{0,31,63} \\
|
|
\wordbox{1}{\texttt{page count}} \\
|
|
\wordbox{3}{\texttt{page max}} \\
|
|
\wordbox{3}{\texttt{record identifier}} \\
|
|
\end{bytefield}
|
|
\begin{itemize}
|
|
\item[] the data will be stored on the time of the server
|
|
\end{itemize}
|
|
\caption{Userdata reply}
|
|
\label{fig:userdata_reply}
|
|
\end{figure}
|
|
|
|
\section{\texttt{izaro-coordinate} timing protocol and consensus}
|
|
|
|
|
|
\begin{figure}[h]
|
|
\centering
|
|
\begin{sequencediagram}
|
|
\newthread{a}{: Client}
|
|
\newinst[7]{b}{: Server}
|
|
|
|
\mess[1]{a}{$t_{client}$}{b}
|
|
\mess[1]{b}{$t_{client},t_{server}$}{a}
|
|
\end{sequencediagram}
|
|
\caption{Izaro time synchronization}
|
|
\label{fig:time_proto}
|
|
\end{figure}
|
|
|
|
\begin{figure}[h]
|
|
\centering
|
|
\begin{sequencediagram}
|
|
\newthread{a}{: Client \#1}
|
|
\newinst[7]{b}{: Server}
|
|
|
|
\mess[1]{a}{$t_{1}$ data write}{b}
|
|
\mess[1]{b}{unconfirmed record}{a}
|
|
\begin{callself}{a}{wait $t_{sync}$}{return $t_{2}$}
|
|
\end{callself}
|
|
\mess[1]{a}{$t_{1} - t_{sync} < x < t_{2}$ unconfirmed read}{b}
|
|
\mess[1]{b}{same unconfirmed record}{a}
|
|
\mess[1]{a}{confirm record}{b}
|
|
\mess[1]{b}{confirmed record}{a}
|
|
|
|
\end{sequencediagram}
|
|
\caption{Izaro single user write confirmation}
|
|
\label{fig:confirmation_proto}
|
|
\end{figure}
|
|
|
|
\begin{figure}[h]
|
|
\centering
|
|
\begin{footnotesize}
|
|
\begin{sequencediagram}
|
|
\newinst{da}{: Client \#1}
|
|
\newinst[4]{dc}{: Server}
|
|
\newinst[4]{db}{: Client \#2}
|
|
|
|
\mess[1]{da}{$t_{1}$ data write}{dc}
|
|
\mess[1]{dc}{unconfirmed record $A$}{da}
|
|
\begin{callself}{da}{wait $t_{sync}$}{return $t_{2}$}
|
|
\postlevel\postlevel
|
|
\postlevel\postlevel
|
|
\end{callself}
|
|
\mess[1]{da}{$t_{1} - t_{sync} < x < t_{2}$ unconfirmed read}{dc}
|
|
\mess[1]{dc}{self unconfirmed record is first}{da}
|
|
\postlevel
|
|
\mess[1]{da}{confirm record}{dc}
|
|
\mess[1]{dc}{confirmed record}{da}
|
|
\prelevel\prelevel
|
|
\prelevel\prelevel
|
|
\prelevel\prelevel
|
|
\prelevel\prelevel
|
|
\prelevel\prelevel
|
|
\prelevel\prelevel
|
|
\prelevel\prelevel
|
|
\prelevel\prelevel
|
|
\prelevel\prelevel
|
|
|
|
\mess[1]{db}{$t_{1}'$ data write}{dc}
|
|
\mess[1]{dc}{unconfirmed record $B$}{db}
|
|
\begin{callself}{db}{wait $t_{sync}$}{return $t_{2}'$}
|
|
\postlevel\postlevel
|
|
\postlevel\postlevel
|
|
\end{callself}
|
|
\mess[1]{db}{$t_{1}'-t_{sync} < x < t_{2}'$ unconfirmed read}{dc}
|
|
\mess[1]{dc}{self unconfirmed record is not first}{db}
|
|
\begin{callself}{db}{retry}{}
|
|
\end{callself}
|
|
|
|
\end{sequencediagram}
|
|
\end{footnotesize}
|
|
\caption{Izaro dual user write confirmation and cancellation}
|
|
\label{fig:2user_confirmation_proto}
|
|
\end{figure}
|
|
|
|
\backmatter
|
|
|
|
%----------------------------------------------------------------------------------------
|
|
% BIBLIOGRAPHY
|
|
%----------------------------------------------------------------------------------------
|
|
|
|
\chapter*{Bibliography}
|
|
\addcontentsline{toc}{chapter}{\textcolor{ocre}{Bibliography}} % Add a Bibliography heading to the table of contents
|
|
|
|
%------------------------------------------------
|
|
|
|
%\section*{Books}
|
|
%\addcontentsline{toc}{section}{Books}
|
|
\printbibliography[heading=bibempty]
|
|
|
|
\printglossary
|
|
|
|
%------------------------------------------------
|
|
|
|
\listoftables
|
|
|
|
%------------------------------------------------
|
|
|
|
\listoffigures
|
|
|
|
%------------------------------------------------
|
|
%\section*{Books}
|
|
%\addcontentsline{toc}{section}{Books}
|
|
%\printbibliography[heading=bibempty,type=book]
|
|
|
|
%----------------------------------------------------------------------------------------
|
|
% INDEX
|
|
%----------------------------------------------------------------------------------------
|
|
|
|
\cleardoublepage % Make sure the index starts on an odd (right side) page
|
|
\phantomsection
|
|
\setlength{\columnsep}{0.75cm} % Space between the 2 columns of the index
|
|
\addcontentsline{toc}{chapter}{\textcolor{ocre}{Index}} % Add an Index heading to the table of contents
|
|
\printindex % Output the index
|
|
|
|
\end{document}
|