\documentclass{beamer}

\mode<presentation>
{
  \usetheme{Warsaw}
}

\usepackage[english]{babel}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{pgf}
\usepackage{fancybox}
\usepackage{graphicx}
\title{How to Build an Open Source FPGA Toolchain}
\author{S\'ebastien Bourdeauducq}
\institute{Milkymist RTC}
\date{International Conference on Accelerator and Large Experimental Physics Control Systems \\ --- \\ Open Hardware Workshop \\ --- \\ October 9th 2011 }

\begin{document}

\begin{frame}
  \titlepage
\end{frame}

\section{Common issues}

\begin{frame}
\frametitle{Common issues}
\begin{block}{Problem}
Instead of developing independent tools, you want to talk FPGA companies into opening their source code.
\end{block}

\pause
\begin{block}{Solution}
They won't. Save your breath and start coding.
\end{block}
\end{frame}

\begin{frame}
\frametitle{Common issues}
\begin{block}{Problem}
You think that FPGA companies do not publish the information needed to write a toolchain and reverse engineering seems impossible.
\end{block}

\pause
\begin{block}{Solution}
Find out that FPGA companies do \textbf{publish a lot of information} (netlist documentation, raw chip structure including interconnect, parts of the timing model). Look closer at the secret Xilinx bitstream format, it is surprisingly \textbf{straightforward and easy} to reverse engineer.
\end{block}
\end{frame}

\begin{frame}
\frametitle{Common issues}
\begin{block}{Problem}
You do not want to work for free for uncooperative FPGA companies. Reverse engineering seems to be a waste of time as FPGA chips may quickly become obsolete.
\end{block}

\pause
\begin{block}{Solution A}
Found your own FPGA startup.
\end{block}

\pause
\begin{block}{Solution B}
Bite the bullet and consider that a very large part of the toolchain infrastructure can be \textbf{generic} and \textbf{portable} to many FPGA families -- and hopefully to an open one someday.
\end{block}
\end{frame}

\begin{frame}
\frametitle{Common issues}
\begin{block}{Problem}
You are afraid of destroying FPGAs while developing your toolchain.
\end{block}

\pause
\begin{block}{Solution}
\begin{itemize}
\item Risks of damaging modern FPGAs are \textbf{overrated}. Did you know that temporary internal short circuits occur even with the regular Xilinx flow?
\item Test first on \textbf{cheap} FPGAs, some are easily available for less than \$10.
\item No risk, no fun!
\end{itemize}
\end{block}
\end{frame}

\begin{frame}
\frametitle{Common issues}
\begin{block}{Problem}
Building a FPGA toolchain seems impossibly hard and you don't know where to begin.
\end{block}

\pause
\begin{block}{Solution}
\begin{itemize}
\item \textbf{Study} in detail how the existing FPGA toolchains work.
\item Replace the proprietary tools \textbf{one by one} and consider \textbf{compatibility} with them.
\item \textbf{Perfection is unattainable}. Accept being suboptimal in many cases (like the proprietary toolchain).
\item Start with a \textbf{very crude} toolchain and improve it later.
\end{itemize}
\end{block}
\end{frame}

\begin{frame}
\frametitle{Common issues}
\begin{block}{Problem}
You know very well how FPGA toolchains work, and it still seems too large of a task for you.
\end{block}

\pause
\begin{block}{Solution}
Projects like Linux and GNU have been in the same situation. Start \textbf{crude, small and modular}, and adopt an \textbf{efficient open source} development model.
\end{block}
\end{frame}

\section{The big picture}
\begin{frame}
\frametitle{The big picture}
\begin{pgfpicture}{0cm}{0cm}{\textwidth}{6cm}
\pgfsetlinewidth{1.2pt}
\pgfnodebox{syn}[stroke]{\pgfxy(5,5)}{Synthesis}{2pt}{2pt}
\pgfnodebox{pck}[stroke]{\pgfxy(5,4)}{Packing}{2pt}{2pt}
\pgfnodebox{pla}[stroke]{\pgfxy(5,3)}{Placement}{2pt}{2pt}
\pgfnodebox{rou}[stroke]{\pgfxy(5,2)}{Routing}{2pt}{2pt}
\pgfnodebox{bit}[stroke]{\pgfxy(5,1)}{Bitstream encoding}{2pt}{2pt}
\pgfnodesetsepstart{0pt}
\pgfnodesetsepend{0pt}
\pgfsetendarrow{\pgfarrowto}
\pgfnodeconnline{syn}{pck}
\pgfnodeconnline{pck}{pla}
\pgfnodeconnline{pla}{rou}
\pgfnodeconnline{rou}{bit}
\end{pgfpicture}
\end{frame}

\section{FPGA tools cookbook}

\subsection{Synthesis}
\begin{frame}[fragile]
\frametitle{Synthesis}
\begin{columns}
\column{.5\textwidth}
\begin{verbatim}
module foo(input clk,
  input a, input b,
  output x);
always @(posedge clk)
  x <= a & b;
endmodule
\end{verbatim}
\column{.5\textwidth}
\begin{pgfpicture}{0cm}{0cm}{\textwidth}{4cm}
\pgfsetlinewidth{1.2pt}
\pgfnodebox{ibufa}[stroke]{\pgfxy(0,3)}{IBUF[a]}{2pt}{2pt}
\pgfnodebox{ibufb}[stroke]{\pgfxy(0,2)}{IBUF[b]}{2pt}{2pt}
\pgfnodebox{ibufg}[stroke]{\pgfxy(0,1)}{IBUFG[clk]}{2pt}{2pt}
\pgfnodebox{lut2}[stroke]{\pgfxy(2,3)}{LUT2[1000]}{2pt}{2pt}
\pgfnodebox{fd}[stroke]{\pgfxy(2,2)}{FD}{2pt}{2pt}
\pgfnodebox{obuf}[stroke]{\pgfxy(4,2)}{OBUF[x]}{2pt}{2pt}
\pgfnodesetsepstart{0pt}
\pgfnodesetsepend{0pt}
\pgfsetendarrow{\pgfarrowto}
\pgfnodeconnline{ibufa}{lut2}
\pgfnodeconnline{ibufb}{lut2}
\pgfnodeconnline{lut2}{fd}
\pgfnodeconnline{ibufg}{fd}
\pgfnodeconnline{fd}{obuf}
\end{pgfpicture}
\end{columns}

\center\shadowbox{Synthesis transforms *HDL sources into a graph of \textit{primitives}.}

It is called the \textit{netlist} and contains no information about the physical locations of logic and routes on the chip.
\end{frame}

\begin{frame}[fragile]
\frametitle{A rather transparent process}
\begin{itemize}
\item Xst netlists in proprietary NGC format can be exported to ``human-readable'' EDIF (\verb^ngc2edif^).
\item EDIF netlists can be converted back to NGC and used with all Xilinx tools (\verb^edif2ngc^).
\item Verilog and VHDL netlists (instantiating primitives) can be extracted from NGC files (\verb^netgen^).
\item EDIF and NGC netlists can be viewed in PlanAhead.
\item Most primitives are documented by Xilinx.
\item Behavioral Verilog models are published for all primitives (UNISIM).
\end{itemize}
\center\shadowbox{A synthesizer should not be harder to write than a C compiler.}
\end{frame}

\subsection{Packing}
\begin{frame}
\frametitle{Why packing?}
\begin{figure}
\centering \includegraphics[height=7cm]{slicex.png}
\end{figure}
\end{frame}

\begin{frame}
\frametitle{Packing challenges}
\begin{itemize}
\item \textit{Control set restrictions}: some signals (clock, reset, clock enable) are shared between all basic elements (LUTs, flip-flops) in the slice.
\item Primitives packed together should be chosen wisely to reduce routing delays and maximize performance.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{A simple packing algorithm}
\begin{enumerate}
\item Split the input list $L$ of unpacked LUT/FFs into lists $L_{1}...L_{N}$, according to control sets.
\item While at least one of $L_{1}...L_{N}$ is not empty:
\begin{enumerate}
\item Pick a random LUT/FF from a random list $L_{x}$ and pack it in a new slice $S$.
\item While $S$ is not full and $L_{x}$ is not empty:
\begin{enumerate}
\item For each LUT/FF in $L_{x}$, count the number of inputs it has in common with the LUT/FFs already packed in $S$.
\item Pick the LUT/FF with the most common inputs and pack it in $S$. (NB: If there are too few common inputs, we may want to start a new slice instead)
\end{enumerate}
\end{enumerate}
\item Return the graph of interconnected slices.
\end{enumerate}
\end{frame}

\subsection{Placement}
\begin{frame}[fragile]
\frametitle{Placement}
\begin{itemize}
\item The next step is to place the different \textit{sites} (slices, ...) on suitable locations on the chip.
\begin{itemize}
\item For Xilinx, the detailed list of sites is available with \verb^xdl -report <chip>^ and FPGA Editor.
\end{itemize}
\item Try to minimize routing length/delay.
\item Since we do not know the routing yet, we have to use a heuristic.
\begin{itemize}
\item Example: sum of the Manhattan distances between connected endpoints.
\end{itemize}
\item First approach: place sites arbitrarily, then try to improve incrementally (hillclimbing).
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{The problem with hillclimbing}
\begin{figure}
\centering \includegraphics[height=6cm]{local.pdf}
\end{figure}
\end{frame}

\begin{frame}
\frametitle{Simulated annealing}
\begin{itemize}
\item Allow some moves that \textbf{increase cost}, to be capable of \textbf{leaving local minima}.
\item Progressively reduce the probability of acceptance of cost-increasing moves.
\item Analogy with annealing in metallurgy: reduction of crystal defects by slow cooling.
\item The parameter that controls the acceptance of cost-increasing moves is called \textit{temperature}.
\item $P_{accepted} = e^{-\frac{\Delta cost}{T}}$ if $\Delta cost > 0$, $P_{accepted} = 1$ otherwise.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{A simple placement algorithm}
\begin{itemize}
\item Place all sites randomly.
\item While the solution is not good enough:
\begin{itemize}
\item Pick a random move and compute its cost variation (i.e.\ difference in the heuristic function to be optimized).
\item If $P_{accepted}(\Delta cost,T) > random(0,1)$, accept the move.
\item Update the temperature T.
\end{itemize}
\end{itemize}

\center \shadowbox{For more details, see for example VPR.}

\end{frame}

\subsection{Routing}
\begin{frame}
\frametitle{Routing overview}
\begin{itemize}
\item Purpose: establish the connections between the placed sites.
\item \textbf{One output} pin of a site drives \textbf{one or several input} pins.
\item Chips contain a network of \textit{wires} and unidirectional switches called \textit{PIPs (Programmable Interconnect Points)} can connect two wires together.
\item Site I/Os are permanently attached to special \textit{pinwires} (on which PIPs are present to continue routing).
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{Understanding the switch network}
\begin{itemize}
\item All PIPs and wires can be listed with \verb^xdl -report -pips <chip>^
\item Very large text output (1GB for medium-small FPGAs).
\item Fully expanded description of a quite \textbf{regular} architecture.
\item Many switching \textbf{structures have equivalent} PIP/wire graphs.
\item Need to \textbf{identify all types} of switching structures present in a FPGA family:
\begin{itemize}
\item smaller ``factored'' database
\item faster tool runtime, less memory utilization
\item enables interconnect timing models to be built
\end{itemize}
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{A simple routing algorithm}
\begin{enumerate}
\item $P_{s} = pinwire(source); P_{d} = pinwires(destinations);$
\item $R = \{P_{s}\}$
\item While $P_{d} \not\subset R$:
\begin{enumerate}
\item For all wires $w$ in $R$, for all PIPs $p$ departing from $w$ with $destination(p) \notin R$ and $destination(p)$ is available:
\begin{enumerate}
\item $R = R \cup \{destination(p)\}$
\item $H[destination(p)] = p$
\end{enumerate}
\end{enumerate}
\item $L=\{\}$
\item For all pinwires $d$ in $P_{d}$:
\begin{enumerate}
\item $c=d$; while $c \neq P_{s}$:
\begin{enumerate}
\item $L = L \cup \{H[c]\}$
\item $c = source(H[c])$
\end{enumerate}
\end{enumerate}
\item Return the set of PIPs to use $L$.
\end{enumerate}
\end{frame}

\subsection{Bitstream format}
\begin{frame}[fragile]
\frametitle{Bitstream format}
\begin{itemize}
\item General structure is regular and sparsely documented.
\item Site configuration encoding:
\begin{itemize}
\item Straightforward black-box reverse engineering.
\item Change with FPGA Editor or \verb^xdl^, examine difference in output.
\end{itemize}
\item Interconnect configuration encoding:
\begin{itemize}
\item DRC won't let you turn on individual PIPs.
\item Crack software to remove this restriction (if possible).
\item Or use set operations to isolate bits (Debit method).
\end{itemize}
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{PIP to bit mapping}
Simple basic rule: each wire is driven by a single multiplexer controlled by a 1-hot, 2-hot or 3-hot word.

\begin{figure}
\centering \includegraphics[height=4cm]{pipmux.png}
\end{figure}

\end{frame}


\section{Conclusions}

\subsection{Resources}
\begin{frame}
\frametitle{ODIN II, ABC}
Academic tools for synthesis, open source licenses.

Only target theoretic architectures.

\url{http://www.eecg.toronto.edu/vtr/}
\end{frame}

\begin{frame}
\frametitle{VPR}
Academic tool for packing, placement and routing. Non-free, but can serve as inspiration.

Only targets theoretic architectures.

\url{http://www.eecg.utoronto.ca/vpr/}
\end{frame}

\begin{frame}
\frametitle{RapidSmith}
Academic software similar to FPGA Editor. Open source but written in Java.

Based on XDL and can be used with real chip and the proprietary Xilinx bitstream generator.

\url{http://rapidsmith.sourceforge.net/}
\end{frame}

\begin{frame}
\frametitle{Debit}
A pioneering attempt at making sense of the Xilinx bitstream format.

\url{http://www.ulogic.org/}
\end{frame}

\begin{frame}
\frametitle{ReCoBus tools}
ReCoBus has made more thorough bitstream format analyses. Software is non-free, but very interesting publications.

\url{http://www.recobus.de/}
\end{frame}

\begin{frame}
\frametitle{Leading an open source project}
Linus Torvalds's opinions are generally interesting...

\url{http://h30565.www3.hp.com/t5/Feature-Articles/Linus-Torvalds-s-Lessons-on}\\
\url{-Software-Development-Management/ba-p/440}
\end{frame}

\subsection{Conclusion}
\begin{frame}
\frametitle{Conclusion}
\center\shadowbox{An open source FPGA toolchain is possible, it just needs some work.}

These slides online and more:\\
\url{http://www.milkymist.org/fpgatools}\\
Thank you for your attention!

\end{frame}

\subsection{Going further}
\begin{frame}
\frametitle{Going further}
\begin{columns}
\column{.5\textwidth}
\begin{itemize}
\item Synthesis
\begin{itemize}
\item High performance LUT mappers
\item Resource sharing
\item Retiming
\item FSM extraction
\item Memory extraction
\item Carry chains (arithmetic, comparators, ...)
\item Multiplier blocks
\end{itemize}
\end{itemize}

\column{.5\textwidth}
\begin{itemize}
\item Physical implementation
\begin{itemize}
\item Timing model
\item Timing-driven routing
\item Congestion management
\item Post-placement packing
\item Analytical global placers
\item Relative placement constraints (e.g.\ carry chains)
\item Floorplanning
\item Partial reconfiguration
\end{itemize}
\end{itemize}
\end{columns}

\center\shadowbox{Let's worry about those later...}

\end{frame}


\end{document}
