The Collection of
Computer Science Bibliographies

Search results (within Papers and talks produced by the Institut für Computersprachen, TU Wien):


RSS feed
Permalink

Search for:
(online) sort by:    
Returned: 146 Documents (sorted by year)

@Unpublished{ertl19ft,
  author =	"M. Anton Ertl and Bernd Paysan",
  title =	"Der neue {Gforth}-Header",
  note = 	"Vortrag bei der Forth-Tagung 2019",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl19ft.pdf",
  video =	"https://forth-ev.de/wiki/events:ft2019:anton-header",
  year = 	"2019",
}
@Unpublished{ertl19ft2,
  author =	"M. Anton Ertl",
  title =	"Forth-Quellcode im Flash",
  note = 	"Vortrag bei der Forth-Tagung 2019",
  URL =  	"https://forth-ev.de/wiki/events:ft2019:anton-flash",
  year = 	"2019",
  usenetmsgid =  "<2019Apr19.185811@mips.complang.tuwien.ac.at>",
}
@InProceedings{paysan&ertl19,
  author =	"Bernd Paysan and M. Anton Ertl",
  title =	"The new {Gforth} Header",
  booktitle =	"35th EuroForth Conference",
  year = 	"2019",
  pages =	"5--20",
  URL =  	"http://www.euroforth.org/ef19/papers/paysan.pdf",
  slides-url =	"http://www.euroforth.org/ef19/papers/paysan-slides.pdf",
  video =	"https://wiki.forth-ev.de/doku.php/events:ef2019:header",
  OPTnote =	"refereed",
  abstract =	"The new Gforth header is designed to directly
		 implement the requirements of Forth-94 and Forth-2012.
		 Every header is an object with a fixed set of fields
		 (code, parameter, count, name, link) and methods
		 (\texttt{execute}, \texttt{compile}, \texttt{(to)},
		 \texttt{defer@}, \texttt{does},
		 \texttt{name>interpret}, \texttt{name>compile},
		 \texttt{name>string}, \texttt{name>link}). The
		 implementation of each method can be changed per-word
		 (prototype-based object-oriented programming). We
		 demonstrate how to use these features to implement
		 optimization of constants, \texttt{fvalue},
		 \texttt{defer}, \texttt{immediate}, \texttt{to} and
		 other dual-semantics words, and \texttt{synonym}.",
}
@Misc{ef2019:rs-int,
  author =	"M. Anton Ertl",
  title =	"Interactive multiline \code{>R R>} in {Gforth}",
  howpublished = "https://wiki.forth-ev.de/doku.php/events:ef2019:rs-int",
  year = 	"2019",
  note = 	"Video of otherwise unpublished talk at EuroForth
		 2019",
}
@InProceedings{ertl19kps,
  author =	"M. Anton Ertl",
  title =	"Integer Division by Multiplying with the Double-Width
		 Reciprocal",
  booktitle =	"20. Kolloquium Programmiersprachen und Grundlagen der
		 Programmierung (KPS)",
  year = 	"2019",
  editor =	"Martin Pl{\"u}micke and Fayez Abu Alia",
  pages =	"75--84",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl19kps.pdf",
  slides-url =	"http://www.complang.tuwien.ac.at/papers/ertl19kps-slides.pdf",
  abstract =	"Earlier work on integer division by multiplying with
		 the reciprocal has focused on multiplying with a
		 single-width reciprocal, combined with a correction and
		 followed by a shift. The present work explores using a
		 double-width reciprocal to allow getting rid of the
		 correction and shift.",
}
@InProceedings{thier+18,
  author =	"Patrick Thier and M. Anton Ertl and Andreas Krall",
  title =	"Fast and Flexible Instruction Selection with
		 Constraints",
  booktitle =	"Compiler Construction (CC'18)",
  year = 	"2018",
  pages =	"93--103",
  URL =  	"http://www.complang.tuwien.ac.at/papers/thier+18.pdf",
  DOI =  	"10.1145/3178372.3179501",
  abstract =	"Tree-parsing instruction selection as used in, e.g.,
		 lcc, uses dynamic costs to gain flexibility and handle
		 situations (such as read-modify-write instructions)
		 that do not fit into the basic tree-parsing model. The
		 disadvantage of dynamic costs is that we can no longer
		 turn the tree grammar into a tree automaton (as is done
		 by burg) for fast instruction selection for JIT
		 compilers. In this paper we introduce constraints that
		 say whether a tree-grammar rule is applicable or not.
		 While theoretically less powerful than dynamic costs,
		 constraints cover the practical uses of dynamic costs;
		 more importantly, they allow turning the tree grammar
		 with constraints into a tree automaton (with
		 instruction-selection-time checks), resulting in faster
		 instruction selection than with pure
		 instruction-selection-time dynamic programming. We
		 integrate constraints in an instruction selector that
		 matches DAGs with tree rules. We evaluate this concept
		 in lcc and the CACAO JavaVM JIT compiler, and see
		 instruction selector speedups by a factor 1.33--1.89.",
  doi-url =	"http://dx.doi.org/10.1145/3178372.3179501",
}
@Unpublished{ertl18ft,
  author =	"M. Anton Ertl",
  title =	"Verallgemeinerung von locals",
  note = 	"Vortrag bei der Forth-Tagung 2018",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl18ft.pdf",
  video =	"https://wiki.forth-ev.de/doku.php/events:tagung-2018:locals",
  year = 	"2018",
}
@InProceedings{ertl18manlang,
  author =	"M. Anton Ertl",
  title =	"Software Vector Chaining",
  booktitle =	"15th International Conference on Managed Languages &
		 Runtimes (Manlang'18)",
  year = 	"2018",
  pages =	"Article-18",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl18manlang.pdf",
  DOI =  	"10.1145/3237009.3237021",
  abstract =	"Providing vectors of run-time determined length as
		 opaque value types is a good interface between the
		 machine-level SIMD instructions and portable
		 application-oriented programming languages.
		 Implementing vector operations requires a loop that
		 breaks the vector into SIMD-register-sized chunks. A
		 compiler can fuse the loops of several vector
		 operations together. However, during normal compilation
		 this is only easy if no other control structures are
		 involved. This paper explores an alternative: collect a
		 trace of vector operations at run-time (following the
		 program control flow during this collecting step), and
		 then perform the combined vector loop. This arrangement
		 has a certain run-time overhead, but its implementation
		 is simpler and can happen independently, in a library.
		 Preliminary performance results indicate that the
		 overhead makes this approach beneficial only for long
		 vectors ($>1$KB). For shorter vectors, unfused loops
		 should be used in a library setting. Fortunately, this
		 choice can be made at run time, individually for each
		 vector operation.",
  doi-url =	"http://dx.doi.org/10.1145/3237009.3237021",
}
@InProceedings{ertl&paysan18,
  author =	"M. Anton Ertl and Bernd Paysan",
  title =	"Closures --- the {Forth} way",
  booktitle =	"34th EuroForth Conference",
  year = 	"2018",
  pages =	"17--30",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl%26paysan.pdf",
  url2 = 	"http://www.euroforth.org/ef18/papers/ertl.pdf",
  slides-url =	"http://www.euroforth.org/ef18/papers/ertl-slides.pdf",
  video =	"https://wiki.forth-ev.de/doku.php/events:ef2018:closures",
  OPTnote =	"refereed",
  abstract =	"In Forth 200x, a quotation cannot access a local
		 defined outside it, and therefore cannot be
		 parameterized in the definition that produces its
		 execution token. We present Forth closures; they lift
		 this restriction with minimal implementation
		 complexity. They are based on passing parameters on the
		 stack when producing the execution token. The
		 programmer has to explicitly manage the memory of the
		 closure. We show a number of usage examples. We also
		 present the current implementation, which takes
		 109~source lines of code (including some extra
		 features). The programmer can mechanically convert
		 lexical scoping (accessing a local defined outside)
		 into code using our closures, by applying assignment
		 conversion and flat-closure conversion. The result can
		 do everything one expects from closures, including
		 passing Knuth's man-or-boy test and living beyond the
		 end of their enclosing definitions.",
}
@Article{ertl18vd,
  author =	"M. Anton Ertl",
  title =	"Forth-200X-Treffen auf der EuroForth 2017",
  journal =	"Vierte Dimension",
  year = 	"2018",
  volume =	"34",
  number =	"3",
  pages =	"5--6",
}
@Unpublished{ertl17ft,
  author =	"M. Anton Ertl",
  title =	"{Statische Typ{\"u}berpr{\"u}fung}",
  note = 	"Vortrag bei der Forth-Tagung 2017",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl17ft.pdf",
  video =	"https://wiki.forth-ev.de/doku.php/events:tagung-2017:statische-typpruefung",
  year = 	"2017",
}
@InProceedings{ertl17,
  author =	"M. Anton Ertl",
  title =	"SIMD and Vectors",
  booktitle =	"33rd EuroForth Conference",
  year = 	"2017",
  pages =	"25--36",
  URL =  	"http://www.euroforth.org/ef17/papers/ertl.pdf",
  video =	"https://wiki.forth-ev.de/lib/exe/fetch.php/events:ef2017:simd-vectors.mp4",
  OPTnote =	"refereed",
  abstract =	"Many programs have parts with significant data
		 parallelism, and many CPUs provide SIMD instructions
		 for processing data-parallel parts faster. The weak
		 link in this chain is the programming language. We
		 propose a vector wordset so that Forth programmers can
		 make use of SIMD instructions to speed up the
		 data-parallel parts of their applications. The vector
		 wordset uses a separate vector stack containing opaque
		 vectors with run-time determined length. Preliminary
		 results using one benchmark show a factor~8 speedup of
		 a simple vector implementation over scalar Gforth code,
		 a smaller (factor 1.8) speedup over scalar VFX code;
		 another factor of 3 is possible on this benchmark with
		 a more sophisticated implementation. However, vectors
		 have an overhead; this overhead is amortized in this
		 benchmark at vector lengths between 3 and 250
		 (depending on which variants we compare).",
}
@InProceedings{ertl17kps,
  author =	"M. Anton Ertl",
  title =	"The Intended Meaning of \emph{Undefined Behaviour} in
		 {C} Programs",
  booktitle =	"19. Kolloquium Programmiersprachen und Grundlagen der
		 Programmierung (KPS'17)",
  year = 	"2017",
  editor =	"Wolfram Amme and Thomas Heinze",
  pages =	"20--28",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl17kps.pdf",
  url2 = 	"http://www.kps2017.uni-jena.de/proceedings/kps2017_submission_5.pdf",
  abstract =	"All significant C programs contain undefined
		 behaviour. There are conflicting positions about how to
		 deal with that: One position is that all these programs
		 are broken and may be compiled to arbitrary code.
		 Another position is that tested and working programs
		 should continue to work as intended by the programmer
		 with future versions of the same C compiler. In that
		 context the advocates of the first position often claim
		 that they do not know the intended meaning of a program
		 with undefined behaviour. This paper explores this
		 topic in greater depth. The goal is to preserve the
		 behaviour of existing, tested programs. It is achieved
		 by letting the compiler define a consistent mapping of
		 C operations to machine code; and the compiler then has
		 to stick to this behaviour during optimizations and in
		 future releases.",
}
@InProceedings{ertl-sections16,
  author =	"M. Anton Ertl",
  title =	"Sections",
  booktitle =	"32nd EuroForth Conference",
  year = 	"2016",
  pages =	"55--57",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl-sections16.pdf",
  video =	"https://wiki.forth-ev.de/lib/exe/fetch.php/events:sections.mp4",
  OPTnote =	"not refereed",
  abstract =	"A section is a contiguous region of memory, to which
		 data or code can be appended (like the Forth
		 dictionary). Assembly languages and linkers have
		 supported multiple sections for a long time. This paper
		 describes the benefits of supporting multiple sections
		 in Forth, interfaces and implementation techniques.",
}
@InProceedings{ertl-recognizers16,
  author =	"M. Anton Ertl",
  title =	"Recognizers: Arguments and Design Decisions",
  booktitle =	"32nd EuroForth Conference",
  year = 	"2016",
  pages =	"58--63",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl-recognizers16.pdf",
  video =	"https://wiki.forth-ev.de/lib/exe/fetch.php/events:recognizers.mp4",
  OPTnote =	"not refereed",
  abstract =	"The Forth text interpreter processes words and
		 numbers. Currently the set of words can be extended by
		 programmers, but not the recognized numbers.
		 User-defined recognizers allow to extend the
		 number-recognizer part, too. This paper shows the
		 benefits of recognizers and discusses counterarguments.
		 It also discusses several design decisions: Whether to
		 define temporary words, or a set of interpretation,
		 compilation, and postponing actions; and whether to
		 hook the recognizers inside \code{find} or in the text
		 interpreter.",
}
@InProceedings{ertl-secure16,
  author =	"M. Anton Ertl",
  title =	"Security",
  booktitle =	"32nd EuroForth Conference",
  year = 	"2016",
  pages =	"82--83",
  URL =  	"http://www.euroforth.org/ef16/papers/ertl-secure.pdf",
  video =	"https://wiki.forth-ev.de/lib/exe/fetch.php/events:security.mp4",
  OPTnote =	"presentation slides",
}
@Unpublished{ertl16ft-secure,
  author =	"M. Anton Ertl",
  title =	"Sicheres Forth",
  note = 	"Vortrag bei der Forth-Tagung 2016",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl16ft-secure.pdf",
  year = 	"2016",
  abstract =	"Buffer overflows sind ein beliebtes Einfallstor fuer
		 Angriffe auf Software. Forth vermeidet zwar einige
		 Probleme von C, aber es ist auch nicht dagegen gefeit.
		 Das Problem komplett zu eliminieren wuerde Forth zu
		 weit einschraenken, aber man kann die Programme ein
		 zwei Teile teilen: einen, der die unsicheren Features
		 verwenden darf und genau inspiziert werden muss, und
		 einen, der nur sichere Features verwenden darf und bei
		 dem man in der Inspektion nicht auf Buffer overflows
		 schauen muss.",
}
@Unpublished{ertl16ft-vectors,
  author =	"M. Anton Ertl",
  title =	"Gedanken zu SIMD und Vektorisierung",
  note = 	"Vortrag bei der Forth-Tagung 2016",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl16ft-vectors.pdf",
  year = 	"2016",
  abstract =	"Seit zwei Jahrzehnten stellen popul{\"a}re Prozessoren
		 SIMD-Erweiterungen wie SSE zur Verf{\"u}gung. Einen
		 portabler Assembler auf Basis von Forth sollte es auch
		 erm{\"o}glichen, diese Befehle zu nutzen.
		 Klassischerweise geht das {\"u}ber Intrinsics (sehr
		 architekturspezifisch) oder {\"u}ber automatische
		 Vektorisierung (sehr kompliziert). In diesem Vortrag
		 stelle ich einige {\"U}berlegungen zu dem Thema vor.
		 Ein zentrales Element dabei sind {\"U}berlegungen zur
		 Speicherverwaltung, um Komplikationen oder
		 Geschwindigkeitsnachteile durch Abh{\"a}ngigkeiten zu
		 vermeiden.",
}
@Article{ertl16vd,
  author =	"M. Anton Ertl",
  title =	"Forth-2012: Der neue Standard",
  journal =	"Vierte Dimension",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl16vd.pdf",
  year = 	"2016",
  volume =	"32",
  number =	"3",
  pages =	"13--18",
}
@Article{ertl15vd,
  author =	"M. Anton Ertl",
  title =	"Forth 200x­Treffen auf der EuroForth 2015",
  journal =	"Vierte Dimension",
  year = 	"2015",
  volume =	"31",
  number =	"3+4",
  pages =	"21",
}
@InProceedings{ertl15,
  author =	"M. Anton Ertl and Bernd Paysan",
  title =	"From \texttt{exit} to \texttt{set-does>} --- A Story
		 of {Gforth} Re-Implementation",
  booktitle =	"31st EuroForth Conference",
  year = 	"2015",
  pages =	"41--47",
  URL =  	"http://www.euroforth.org/ef15/papers/ertl.pdf",
  slides-url =	"http://www.euroforth.org/ef15/papers/ertl-slides.pdf",
  OPTnote =	"not refereed",
  abstract =	"We changed \code{exit} from an immediate to a
		 non-immediate word; this requires changes in the
		 de-allocation of locals, which leads to changes in the
		 implementation of colon definitions, and to
		 generalizing \code{does>} into \code{set-does>} which
		 allows the defined word to call arbitrary execution
		 tokens. The new implementation of locals cleanup can
		 usually be optimized to similar performance as the old
		 implementation. The new implementation of \code{does>}
		 has performance similar to the old implementation,
		 while using \code{set-does>} results in speedups in
		 certain cases.",
}
@InProceedings{ertl-recognizers15,
  author =	"M. Anton Ertl",
  title =	"Recognizers --- Why and How",
  booktitle =	"31st EuroForth Conference",
  year = 	"2015",
  pages =	"77--78",
  slides-url =	"http://www.euroforth.org/ef15/papers/ertl-recognizers-slides.pdf",
  OPTnote =	"presentation slides",
}
@InProceedings{ertl15kps,
  author =	"M. Anton Ertl",
  title =	"What every compiler writer should know about
		 programmers",
  booktitle =	"18. Kolloquium Programmiersprachen und Grundlagen der
		 Programmierung (KPS'15)",
  year = 	"2015",
  editor =	"Jens Knoop and M. Anton Ertl",
  pages =	"112--133",
  URL =  	"http://www.complang.tuwien.ac.at/kps2015/proceedings/KPS_2015_submission_29.pdf",
  abstract =	"In recent years C compiler writers have taken the
		 attitude that tested and working production (i.e.,
		 \emph{conforming} according to the C standard) C
		 programs are buggy if they contain undefined behavior,
		 and they feel free to compile these programs (except
		 benchmarks) in a way that they no longer work. The
		 justification for this attitude is that it allows C
		 compilers to optimize better. But while these
		 ``optimizations'' provide a factor 1.017 speedup with
		 Clang-3.1 on SPECint 2006, for non-benchmarks it also
		 has a cost: if we go by the wishes of the compiler
		 maintainers, we have to ``fix'' our working, conforming
		 C programs; this requires a substantial effort, and can
		 result in bigger and slower code. The effort could
		 otherwise be invested in source-level optimizations by
		 programmers, which can have a much bigger effect (e.g.,
		 a factor $>2.7$ for Jon Bentley's traveling salesman
		 program). Therefore, optimizations that assume that
		 undefined behavior does not exist are a bad idea not
		 just for security, but also for the performance of
		 non-benchmark programs.",
}
@Proceedings{kps15,
  title =	"18. Kolloquium Programmiersprachen und Grundlagen der
		 Programmierung (KPS'15)",
  booktitle =	"18. Kolloquium Programmiersprachen und Grundlagen der
		 Programmierung (KPS'15)",
  year = 	"2015",
  key =  	"KPS '15",
  editor =	"Jens Knoop and M. Anton Ertl",
  URL =  	"http://www.complang.tuwien.ac.at/kps2015/proceedings/",
  pdf-url =	"http://www.complang.tuwien.ac.at/kps2015/proceedings/kps15.pdf",
}
@Unpublished{ertl14ft,
  author =	"M. Anton Ertl",
  title =	"{Forth in "Grundlagen der Programmkonstruktion"}",
  note = 	"Vortrag bei der Forth-Tagung 2014",
  URL =  	"http://www.forth-ev.de/wiki/lib/exe/fetch.php/events:slides-anton.pdf",
  year = 	"2014",
}
@InProceedings{ertl14,
  author =	"M. Anton Ertl",
  title =	"Region-Based Memory Allocation in {Forth}",
  booktitle =	"30th EuroForth Conference",
  year = 	"2014",
  pages =	"45--49",
  URL =  	"http://www.complang.tuwien.ac.at/anton/euroforth/ef14/papers/ertl.pdf",
  OPTnote =	"not refereed",
  abstract =	"Memory management has a pervasive effect on the way we
		 program. In region-based memory allocation, objects
		 with roughly the same life expectancy are allocated in
		 one region, and in the end the whole region is freed at
		 once. This avoids the need to keep track of the
		 individual objects for \texttt{free}. Regions are
		 simple to implement and compatible with real-time
		 requirements and multi-threading, and seem to be ideal
		 for Forth, except for one thing: The region id has to
		 be passed to the allocation word, increasing the stack
		 load. We propose using context wrappers to avoid that
		 problem. This even allows to use existing
		 \texttt{allocate}-based libraries with regions, but we
		 then have to decide what \texttt{free} and
		 \texttt{resize} inside these libraries do.",
}
@InProceedings{ertl14-c,
  author =	"M. Anton Ertl",
  title =	"How to get rid of {C}",
  booktitle =	"30th EuroForth Conference",
  year = 	"2014",
  pages =	"63--65",
  URL =  	"http://www.complang.tuwien.ac.at/anton/euroforth/ef14/papers/ertl-c.pdf",
  video =	"http://www.forth-ev.de/wiki/doku.php/events:euroforth-2014:ridofc",
  OPTnote =	"Presentation slides",
}
@Unpublished{ertl13ft,
  author =	"M. Anton Ertl",
  title =	"{Forth als Basis f{\"u}r einen portablen Assembler}",
  note = 	"Vortrag bei der Forth-Tagung 2013",
  URL =  	"https://www.complang.tuwien.ac.at/anton/slides/forth-tagung13.pdf",
  year = 	"2013",
}
@InProceedings{ertl13-paf,
  author =	"M. Anton Ertl",
  title =	"{PAF}: A Portable Assembly Language",
  booktitle =	"29th EuroForth Conference",
  year = 	"2013",
  pages =	"30--38",
  URL =  	"http://www.complang.tuwien.ac.at/anton/euroforth/ef13/papers/ertl-paf.pdf",
  OPTnote =	"not refereed",
  abstract =	"A portable assembly language provides access to
		 machine-level features like memory addresses, machine
		 words, code addresses, and modulo arithmetics, like
		 assembly language, but abstracts away differences
		 between architectures like the assembly language
		 syntax, instruction encoding, register set size, and
		 addressing modes. Forth already satisfies a number of
		 the characteristics of a portable assembly language,
		 and is therefore a good basis. This paper presents PAF,
		 a portable assembly language based on Forth, and
		 specifically discusses language features that other
		 portable assembly languages do not have, and their
		 benefits; it also discusses the differences from Forth.
		 The main innovations of PAF are: tags indicate the
		 control flow for indirect branches and calls; and PAF
		 has two kinds of calls and definitions: the ABI ones
		 follow the platform's calling convention and are useful
		 for interfacing to the outside world, while the PAF
		 ones allow tail-call elimination and are useful for
		 implementing general control structures.",
}
@InProceedings{ertl13-strings,
  author =	"M. Anton Ertl",
  title =	"Standardize Strings Now!",
  booktitle =	"29th EuroForth Conference",
  year = 	"2013",
  pages =	"39--43",
  URL =  	"http://www.complang.tuwien.ac.at/anton/euroforth/ef13/papers/ertl-strings.pdf",
  OPTnote =	"not refereed",
  abstract =	"This paper looks at the issues in string words: what
		 operations may be required, various design options, and
		 why this has lead to the current state of
		 standardization of string operations that is
		 insufficient in the eyes of many.",
}
@InProceedings{ertl13-regions,
  author =	"M. Anton Ertl",
  title =	"Region-Based Memory Allocation",
  booktitle =	"29th EuroForth Conference",
  year = 	"2013",
  pages =	"65--67",
  URL =  	"http://www.complang.tuwien.ac.at/anton/euroforth/ef13/papers/ertl-regions.pdf",
  OPTnote =	"Presentation slides",
}
@InProceedings{ertl13-kps,
  author =	"M. Anton Ertl",
  title =	"{PAF}: A Portable Assembly Language Based on {F}orth",
  booktitle =	"17. Kolloquium Programmiersprachen und Grundlagen der
		 Programmierung (KPS'13)",
  year = 	"2013",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl-kps13.ps.gz",
  abstract =	"A portable assembly language provides access to
		 machine-level features like memory addresses, machine
		 words, code addresses, and modulo arithmetics, like
		 assembly language, but abstracts away differences
		 between architectures like the assembly language
		 syntax, instruction encoding, register set size, and
		 addressing modes. Forth already satisfies a number of
		 the characteristics of a portable assembly language,
		 and is therefore a good basis. This paper presents PAF,
		 a portable assembly language based on Forth, and
		 specifically discusses language features that other
		 portable assembly languages do not have, and their
		 benefits; it also discusses the differences from Forth.
		 The main innovations of PAF are: tags indicate the
		 control flow for indirect branches and calls; and PAF
		 has two kinds of calls and definitions: the ABI ones
		 follow the platform's calling convention and are useful
		 for interfacing to the outside world, while the PAF
		 ones allow tail-call elimination and are useful for
		 implementing general control structures.",
}
@Unpublished{ertl12ft,
  author =	"M. Anton Ertl",
  title =	"objects2.fs: Ein modernisiertes objektorientiertes
		 Paket",
  note = 	"Vortrag bei der Forth-Tagung 2012",
  URL =  	"http://forth-ev.de/filemgmt/visit.php?lid=415",
  year = 	"2012",
}
@InProceedings{ertl12,
  author =	"M. Anton Ertl",
  title =	"Methods in objects2: Duck Typing and Performance",
  booktitle =	"28th EuroForth Conference",
  year = 	"2012",
  pages =	"96--103",
  URL =  	"http://www.complang.tuwien.ac.at/anton/euroforth/ef12/papers/ertl.pdf",
  OPTnote =	"not refereed",
  abstract =	"The major new feature of objects2 is defining methods
		 for any class (like in Smalltalk): this means that we
		 can have two classes that are unrelated by inheritance,
		 yet react to the same messages and can be used in the
		 same contexts; this is also known as duck typing. This
		 paper discusses the implementation of method dispatch
		 for these general selectors as well as the more
		 restricted class selectors of the original
		 \code{objects.fs}, and compares the memory and
		 execution time costs of these method selector
		 implementations: Unhashed general selectors are as fast
		 as class selectors (down to two instructions), but can
		 consume a lot of memory (megabytes of dispatch tables
		 for large class hierarchies); hashed general selectors
		 are significantly slower ($\ge 43$ cycles), but consume
		 less memory. Programmers don't need to choose a
		 selector implementation up front; instead, it is easy
		 to switch between them later, on a per-selector
		 basis.",
}
@InProceedings{wodni&ertl11,
  author =	"Gerald Wodni and M. Anton Ertl",
  title =	"{SWIG} \& {The Forth Net}: Hands-On",
  booktitle =	"27th EuroForth Conference",
  year = 	"2011",
  pages =	"32--35",
  URL =  	"http://www.complang.tuwien.ac.at/anton/euroforth/ef11/papers/wodni.pdf",
  OPTnote =	"not refereed",
  abstract =	"We have shown the basic functionality of SWIG and The
		 Forth Net in the past. Now we want to provide two basic
		 examples which explain how to use them and to show how
		 easy it is to create C-interfaces or Forth-libraries
		 and share them.",
}
@InProceedings{ertl11euroforth,
  author =	"M. Anton Ertl",
  title =	"Ways to Reduce the Stack Depth",
  booktitle =	"27th EuroForth Conference",
  year = 	"2011",
  pages =	"36--41",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl11euroforth.ps.gz",
  url2 = 	"http://www.complang.tuwien.ac.at/anton/euroforth/ef11/papers/ertl.pdf",
  OPTnote =	"not refereed",
  abstract =	"Having to deal with many different data can lead to
		 problems in Forth: The data stack is the preferred
		 place to store data; on the other hand, dealing with
		 too many data stack items is cumbersome and usually bad
		 style. This paper presents and discusses ways to
		 unburden the data stack; some of them are used widely,
		 others are almost unknown or new.",
}
@Unpublished{wodni&ertl11ft,
  author =	"Gerald Wodni and M. Anton Ertl",
  title =	"SWIG-Erweiterung f{\"u}r Forth",
  note = 	"Vortrag bei der Forth-Tagung 2011",
  URL =  	"http://forth-ev.de/filemgmt/visit.php?lid=354",
  year = 	"2011",
}
@Unpublished{ertl11ft-stack,
  author =	"M. Anton Ertl",
  title =	"Techniken f{\"u}r weniger Stack-Tiefe",
  note = 	"Vortrag bei der Forth-Tagung 2011",
  URL =  	"http://forth-ev.de/filemgmt/visit.php?lid=353",
  year = 	"2011",
}
@Unpublished{ertl11ft-multi,
  author =	"M. Anton Ertl",
  title =	"Multi-Threading und Multi-Tasking in Gforth",
  note = 	"Vortrag bei der Forth-Tagung 2011",
  URL =  	"http://forth-ev.de/filemgmt/visit.php?lid=351",
  year = 	"2011",
}
@Unpublished{ertl11ft-string,
  author =	"M. Anton Ertl",
  title =	"Ausgabe in Strings",
  note = 	"Vortrag bei der Forth-Tagung 2011",
  URL =  	"http://forth-ev.de/filemgmt/visit.php?lid=351",
  year = 	"2011",
}
@InProceedings{ertl&kuehling10,
  author =	"M. Anton Ertl and David K{\"u}hling",
  title =	"\texttt{ABI-CODE}: Increasing the portability of
		 assembly language words",
  booktitle =	"26th EuroForth Conference",
  year = 	"2010",
  pages =	"5--14",
  URL =  	"http://www.complang.tuwien.ac.at/anton/euroforth/ef10/papers/ertl.pdf",
  OPTnote =	"refereed",
  abstract =	"Code words are not portable between Forth systems,
		 even on the same architecture; worse, in the case of
		 Gforth, they are not even portable between different
		 engines nor between different installations. We propose
		 a new mechanism for interfacing to assembly code:
		 \texttt{abi-code} words are written to comply with the
		 calling conventions (ABI) of the target platform, which
		 does not change between Forth systems. In the trade-off
		 between performance and portability, \texttt{abi-code}
		 provides a new option between code words and colon
		 definitions. Compared to \texttt{code} words, the
		 \texttt{abi-code} mechanism incurs an overhead of 16
		 instructions on AMD64. Compared to colon definitions,
		 we achieved a speedup by a factor of 1.27 on an
		 application by rewriting one short colon definition as
		 an abi-code word.",
}
@InProceedings{wodni10,
  author =	"Gerald Wodni and M. Anton Ertl",
  title =	"The Forth Net",
  booktitle =	"26th EuroForth Conference",
  year = 	"2010",
  pages =	"68--69",
  URL =  	"http://www.complang.tuwien.ac.at/anton/euroforth/ef10/papers/wodni.pdf",
  OPTnote =	"not refereed",
  abstract =	"CPAN and PECL are impressive ways of sharing custom
		 libraries. Projects are discussed, hosted and
		 downloaded. Their dependencies are clear (no need to
		 search across the web) and also downloaded at once.
		 There is no such web portal for Forth --- until now.",
}
@Unpublished{ertl09ft1,
  author =	"M. Anton Ertl",
  title =	"Neuigkeiten in Gforth 0.7.0",
  note = 	"Talk given at Forth-Tagung 2009",
  year = 	"2009",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl09ft1.pdf",
}
@Unpublished{ertl09ft2,
  author =	"Gerald Wodni and M. Anton Ertl",
  title =	"Neuigkeiten seit Gforth 0.7.0",
  note = 	"Talk given at Forth-Tagung 2009",
  year = 	"2009",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl09ft2.pdf",
}
@Article{ertl09vd3a,
  author =	"M. Anton Ertl",
  title =	"Gforth~0.7.0",
  journal =	"Vierte Dimension",
  year = 	"2009",
  volume =	"25",
  number =	"3",
  pages =	"13--14",
  OPTnote =	"not refereed",
}
@Article{ertl09vd3b,
  author =	"M. Anton Ertl",
  title =	"Forth200x --- Berichte von den
		 Standardisierungstreffen",
  journal =	"Vierte Dimension",
  year = 	"2009",
  volume =	"25",
  number =	"3",
  pages =	"22",
  OPTnote =	"not refereed",
}
@Unpublished{ertl09autrans,
  author =	"M. Anton Ertl",
  title =	"Domination-Based Scoping and Static Single Assignment
		 Languages",
  note = 	"Talk given at Static Single-Assignment Form Seminar in
		 Autrans, France",
  month =	apr,
  year = 	"2009",
  slidesurl =	"http://www.prog.uni-saarland.de/ssasem/talks/Anton.Ertl.pdf",
  abstract =	"SSA form makes data flow easier to understand. This is
		 not just an advantage for the compiler, but can also be
		 an advantage for the programmer. One style of using
		 local variables in Forth is to program in
		 single-assignment style. Based on these experiences, we
		 also designed an experimental programming language with
		 a more conventional syntax that enforces static single
		 assignments, and collected some experience with it. One
		 consequence of this approach is a new scoping regime
		 based on domination; in many cases domination-based
		 scoping is mostly equvalent to classical block-based
		 scoping, but there are also cases where there is a
		 difference.",
}
@InProceedings{ertl09kps,
  author =	"M. Anton Ertl",
  title =	"Utilizing Multiple Hardware Threads with Pipeline
		 Parallelism",
  year = 	"2009",
  booktitle =	"{15. Kolloquium Programmiersprachen und Grundlagen der
		 Programmierung (KPS '09)}",
  editor =	"Jens Knoop and Adrian Prantl",
  pages =	"69--75",
  URL =  	"http://www.complang.tuwien.ac.at/kps09/pdfs/ertl.pdf",
  slidesurl =	"http://www.complang.tuwien.ac.at/kps09/pdfs/vortraege/kps09_ertl.pdf",
  abstract =	"In recent years general-purpose CPUs have aquired
		 multiple hardware threads by providing multiple cores
		 per CPU (multi-core) and multiple hardware threads per
		 core (simultaneous multi-threading). Making good use of
		 such resources has been a challenge for several decades
		 that has been successfully attacked for scientific
		 applications, but not to a significant extent for
		 general-purpose applications. The main current paradigm
		 for this programming problem is to have explicit
		 threads that share memory and are synchronized by a
		 variety of synchronzation constructs. Unfortunately, it
		 seems to be too hard to program profitably in this
		 paradigm for general-purpose applications. Pipeline
		 parallelism is a programming paradigm that has proved
		 so easy to understand that shell programmers use it
		 even on machines that have only one hardware thread. In
		 this work we present the case for better support for
		 pipeline parallelism in programming languages, and
		 present ideas on how to improve the scalability of the
		 implementation of pipeline parallelism.",
}
@InProceedings{ertl09euroforth,
  author =	"M. Anton Ertl",
  title =	"A Look at {G}forth Performance",
  booktitle =	"25th EuroForth Conference",
  year = 	"2009",
  pages =	"23--31",
  URL =  	"http://www.complang.tuwien.ac.at/anton/euroforth/ef09/papers/ertl.pdf",
  slidesurl =	"http://www.complang.tuwien.ac.at/anton/euroforth/ef09/papers/ertl-slides.pdf",
  OPTnote =	"not refereed",
  abstract =	"Gforth used to be an traditional threaded-code system.
		 In the last decade we integrated a number of
		 performance features into Gforth. Several of them were
		 evaluated individually, but an evaluation with a more
		 global perspective has been missing until now. This
		 paper fills this void: We have measured the performance
		 of Gforth releases from 0.5.0 to 0.7.0, on a wide
		 variety of machines, and employing a wide variety of
		 GCC versions for compiling Gforth. We present that data
		 and give explanations for the performance
		 differences.",
}
@Article{shi+08,
  author =	"Yunhe Shi and Kevin Casey and M. Anton Ertl and David
		 Gregg",
  title =	"Virtual machine showdown: Stack versus registers",
  journal =	"ACM Transactions on Architecture and Code Optimization
		 (TACO)",
  year = 	"2008",
  volume =	"4",
  number =	"4",
  pages =	"21:1--21:36",
  month =	jan,
  URL =  	"http://doi.acm.org/10.1145/1328195.1328197",
  abstract =	"Virtual machines (VMs) enable the distribution of
		 programs in an architecture-neutral format, which can
		 easily be interpreted or compiled. A long-running
		 question in the design of VMs is whether a stack
		 architecture or register architecture can be
		 implemented more efficiently with an interpreter. We
		 extend existing work on comparing virtual stack and
		 virtual register architectures in three ways. First,
		 our translation from stack to register code and
		 optimization are much more sophisticated. The result is
		 that we eliminate an average of more than 46\% of
		 executed VM instructions, with the bytecode size of the
		 register machine being only 26\% larger than that of
		 the corresponding stack one. Second, we present a fully
		 functional virtual-register implementation of the Java
		 virtual machine (JVM), which supports Intel, AMD64,
		 PowerPC and Alpha processors. This register VM supports
		 inline-threaded, direct-threaded, token-threaded, and
		 switch dispatch. Third, we present experimental results
		 on a range of additional optimizations such as register
		 allocation and elimination of redundant heap loads. On
		 the AMD64 architecture the register machine using
		 switch dispatch achieves an average speedup of 1.48
		 over the corresponding stack machine. Even using the
		 more efficient inline-threaded dispatch, the register
		 VM achieves a speedup of 1.15 over the equivalent
		 stack-based VM.",
}
@InProceedings{ertl08euroforth,
  author =	"M. Anton Ertl",
  title =	"Cleaning up After Yourself",
  booktitle =	"24th EuroForth Conference",
  year = 	"2008",
  pages =	"35--38",
  URL =  	"http://www.complang.tuwien.ac.at/anton/euroforth/ef08/papers/ertl.pdf",
  psurl =	"http://www.complang.tuwien.ac.at/papers/ertl08euroforth.ps.gz",
  OPTnote =	"not refereed",
  abstract =	"Performing cleanup actions such as restoring a
		 variable or closing a file used to be impossible to
		 guarantee in Forth before Forth-94 gave us
		 \code{catch}. Even with \code{catch}, the cleanup code
		 can be skipped due to user interrupts if you are
		 unlucky. We introduce a construct that guarantees that
		 the cleanup code is always completed. We also discuss a
		 cheaper implementation approach for cleanup code than
		 using a full exception frame.",
}
@Unpublished{ertl08ft,
  author =	"M. Anton Ertl",
  title =	"Die Multicore-Herausforderung",
  note = 	"Talk given at Forth-Tagung 2008",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl08ft.pdf",
  year = 	"2008",
}
@Unpublished{ertl08dagstuhl,
  author =	"M. Anton Ertl",
  title =	"Using C for the Back End",
  note = 	"Talk given at Dagstuhl Seminar on Emerging Uses and
		 Paradigms for Dynamic Binary Translation (08441)",
  year = 	"2008",
  slides-url =	"http://www.dagstuhl.de/Materials/Files/08/08441/08441.ErtlAnton.Slides.pdf",
  abstract =	"If we want to implement a translator easily and
		 portably, but with with good code quality, then
		 translating through C is a good option. While C is a
		 static language, we can also use this technique for a
		 dynamic translator with the help of a dynamic linker.
		 The disadvantage of this approach is the large startup
		 time; this disadvantage can be partially overcome by
		 caching, batching, and seeding the cache. Some
		 challenges for this technique in binary translation are
		 modeling the (arbitrary) control flow in C and the
		 compilation granularity of C. One example of using
		 dynamic translation through C, although not in a binary
		 translation context is the implementation of a foreign
		 function interface.",
}
@InProceedings{ertl07euroforth,
  author =	"M. Anton Ertl",
  title =	"Gforth's libcc {C} Function Call Interface",
  booktitle =	"23rd EuroForth Conference",
  year = 	"2007",
  editor =	"M. Anton Ertl",
  pages =	"7--11",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl07euroforth.ps.gz",
  pdfurl =	"http://www.complang.tuwien.ac.at/anton/euroforth2007/papers/ertl.pdf",
  OPTnote =	"not refereed",
  abstract =	"A major problem in our earlier proposal for a C
		 interface was that a part of the interface was not
		 portable between platforms. The libcc interface solves
		 this problem by using a C compiler and its
		 \code{.h}-files. The \code{.h}-files contain knowledge
		 about the specific platform, and the C compiler
		 automatically inserts the necessary conversions between
		 Forth and C types. In this paper we describe the libcc
		 implementation and interface. We also discuss how a
		 Forth-C interface might be standardized.",
}
@Proceedings{euroforth07,
  title =	"23rd EuroForth Conference",
  booktitle =	"23rd EuroForth Conference",
  year = 	"2007",
  key =  	"EuroForth'07",
  editor =	"M. Anton Ertl",
  URL =  	"http://www.complang.tuwien.ac.at/anton/euroforth2007/papers/proceedings.pdf",
}
@Article{casey+07toplas,
  author =	"Kevin Casey and M. Anton Ertl and David Gregg",
  title =	"Optimizing indirect branch prediction accuracy in
		 virtual machine interpreters",
  journal =	"ACM Transactions on Programming Languages and
		 Systems",
  year = 	"2007",
  volume =	"29",
  number =	"6",
  pages =	"37:1--37:36",
  month =	oct,
  URL =  	"http://doi.acm.org/10.1145/1286821.1286828",
  abstract =	"Interpreters designed for efficiency execute a huge
		 number of indirect branches and can spend more than
		 half of the execution time in indirect branch
		 mispredictions. Branch target buffers (BTBs) are the
		 most widely available form of indirect branch
		 prediction; however, their prediction accuracy for
		 existing interpreters is only 2\%--50\%. In this
		 article we investigate two methods for improving the
		 prediction accuracy of BTBs for interpreters:
		 replicating virtual machine (VM) instructions and
		 combining sequences of VM instructions into
		 superinstructions. We investigate static (interpreter
		 build-time) and dynamic (interpreter runtime) variants
		 of these techniques and compare them and several
		 combinations of these techniques. To show their
		 generality, we have implemented these optimizations in
		 VMs for both Java and Forth. These techniques can
		 eliminate nearly all of the dispatch branch
		 mispredictions, and have other benefits, resulting in
		 speedups by a factor of up to 4.55 over efficient
		 threaded-code interpreters, and speedups by a factor of
		 up to 1.34 over techniques relying on dynamic
		 superinstructions alone.",
}
@Article{ertl07vd1,
  author =	"M. Anton Ertl",
  title =	"Forth 200x -- Treffen auf der EuroForth 2006",
  journal =	"Vierte Dimension",
  year = 	"2007",
  volume =	"23",
  number =	"1",
  pages =	"34",
  URL =  	"http://www.forth-ev.de/filemgmt/visit.php?lid=133",
  OPTnote =	"not refereed",
}
@Article{ertl07vd2a,
  author =	"M. Anton Ertl",
  title =	"Factor, Postscript, und Forth: Ein kleiner Vergleich",
  journal =	"Vierte Dimension",
  year = 	"2007",
  volume =	"23",
  number =	"2",
  pages =	"10--12",
  OPTnote =	"not refereed",
}
@Article{ertl07vd2b,
  author =	"M. Anton Ertl",
  title =	"Der Forth-Stammbaum",
  journal =	"Vierte Dimension",
  year = 	"2007",
  volume =	"23",
  number =	"2",
  pages =	"15--18",
  OPTnote =	"not refereed",
}
@Article{ertl06vd,
  author =	"M. Anton Ertl",
  title =	"Bericht von der EuroForth 2005",
  journal =	"Vierte Dimension",
  year = 	"2006",
  volume =	"22",
  number =	"1",
  pages =	"27",
  month =	jan,
  OPTnote =	"not refereed",
}
@Article{ertl06vd2,
  author =	"M. Anton Ertl",
  title =	"Ank{\"u}ndigung EuroForth 2006",
  journal =	"Vierte Dimension",
  year = 	"2006",
  volume =	"22",
  number =	"2",
  pages =	"8",
  month =	apr,
  OPTnote =	"not refereed",
}
@InProceedings{burgstaller+06,
  author =	"Bernd Burgstaller and Bernhard Scholz and Anton Ertl",
  title =	"An Embedded Systems Programming Environment for {C}",
  booktitle =	"Euro-Par 2006",
  pages =	"1204--1216",
  year = 	"2006",
  volume =	"4128",
  series =	"LNCS",
  publisher =	"Springer",
  ISBN = 	"3-540-37783-2",
  isbn13 =	"978-3-540-37783-2",
  abstract =	"Resource constraints are a major concern with the
		 design, development, and deployment of embedded
		 systems. Embedded systems are highly hardware-dependent
		 and have little computational power. Mobile embedded
		 systems are further constrained by their limited
		 battery capacity. Many of these systems are still
		 programmed in assembly language because there is a lack
		 of efficient programming environments.\par To overcome
		 or at least alleviate the restrictions, we propose a
		 light-weight and versatile programming environment for
		 the C programming language that offers mixed-mode
		 execution, i.e., code is either executed on the CPU or
		 on a virtual machine (VM). This mixed-mode execution
		 environment combines the advantages of highly
		 compressed bytecode with the speed of machine code.\par
		 We have implemented the programming environment and
		 conducted experiments for selected programs of the
		 MiBench suite and the Spec~2000. The VM has a footprint
		 of 12~KB on the Intel~IA32. Initial results show that
		 the performance of the virtual machine is typically
		 only 2 to 36 times slower than the binary execution,
		 with compressed code occupying only 36\%--57\% of the
		 machine code size. Combining sequences of VM
		 instructions into new VM instructions
		 (superinstructions) increases the execution speed and
		 reduces the VM code size. Preliminary experiments
		 indicate a speedup by a factor of 3.",
}
@Unpublished{ertl06forth-tagung1,
  author =	"M. Anton Ertl",
  title =	"Forth 200x",
  note = 	"Talk given at Forth-Tagung 2006 in Witten",
  month =	may,
  year = 	"2006",
  URL =  	"http://www.forth200x.org/",
}
@Article{ertl+06dotnet,
  author =	"M. Anton Ertl and Christian Thalinger and Andreas
		 Krall",
  title =	"Superinstructions and Replication in the {Cacao} {JVM}
		 interpreter",
  journal =	"Journal of .NET Technologies",
  year = 	"2006",
  volume =	"4",
  pages =	"25--32",
  note = 	"Journal papers from \emph{{.NET} Technologies 2006}
		 conference",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl+06dotnet.ps.gz",
  issueurl =	"http://dotnet.zcu.cz/NET_2006/Papers_2006/2006_Vol_4.pdf",
  ISBN = 	"80-86943-13-5",
  ISSN = 	"1801-2108",
  abstract =	"Dynamic superinstructions and replication can provide
		 large speedups over plain interpretation. In a JVM
		 implementation we have to overcome two problems to
		 realize the full potential of these optimizations: the
		 conflict between superinstructions and the quickening
		 optimization; and the non-relocatability of JVM
		 instructions that can throw exceptions. In this paper,
		 we present solutions for these problems. We also
		 present empirical results: We see speedups of up to a
		 factor of 4 on SpecJVM98 benchmarks from
		 superinstructions with all these problems solved. The
		 contribution of making potentially throwing JVM
		 instructions relocatable is up to a factor of 2. A
		 simple way of dealing with quickening instructions is
		 good enough, if superinstructions are generated in JIT
		 style. Replication has little effect on performance.",
}
@Unpublished{ertl+06linkoeping1,
  author =	"M. Anton Ertl",
  title =	"Fast and Flexible Instruction Selection with On-Demand
		 Tree-Parsing Automata",
  note = 	"Talk given at \emph{Mini-Workshop on Code Generation}
		 in Link{\"o}ping, June 7--8",
  year = 	"2006",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl+06pldi.ps.gz",
}
@Unpublished{ertl+06linkoeping2,
  author =	"M. Anton Ertl",
  title =	"Superinstructions and Replication in the {Cacao} {JVM}
		 interpreter",
  note = 	"Talk given at \emph{Mini-Workshop on Code Generation}
		 in Link{\"o}ping, June 7--8",
  year = 	"2006",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl+06dotnet.ps.gz",
}
@InProceedings{ertl+06pldi,
  author =	"M. Anton Ertl and Kevin Casey and David Gregg",
  title =	"Fast and Flexible Instruction Selection with On-Demand
		 Tree-Parsing Automata",
  booktitle =	"SIGPLAN Conference on Programming Language Design and
		 Implementation (PLDI '06)",
  ISBN = 	"1-59593-320-4",
  pages =	"52--60",
  year = 	"2006",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl+06pldi.ps.gz",
  abstract =	"Tree parsing as supported by code generator generators
		 like BEG, burg, iburg, lburg and ml-burg is a popular
		 instruction selection method. There are two existing
		 approaches for implementing tree parsing: dynamic
		 programming, and tree-parsing automata; each approach
		 has its advantages and disadvantages. We propose a new
		 implementation approach that combines the advantages of
		 both existing approaches: we start out with dynamic
		 programming at compile time, but at every step we
		 generate a state for a tree-parsing automaton, which is
		 used the next time a tree matching the state is found,
		 turning the instruction selector into a fast
		 tree-parsing automaton. We have implemented this
		 approach in the Gforth code generator. The
		 implementation required little effort and reduced the
		 startup time of Gforth by up to a factor of 2.5.",
}
@InProceedings{ertl06,
  author =	"M. Anton Ertl",
  title =	"A Portable {C} Function Call Interface",
  booktitle =	"22nd EuroForth Conference",
  year = 	"2006",
  editor =	"M. Anton Ertl",
  pages =	"47--51",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl06.ps.gz",
  pdfurl =	"http://www.complang.tuwien.ac.at/anton/euroforth2006/papers/ertl.pdf",
  OPTnote =	"not refereed",
  abstract =	"Many Forth systems provide means to call C functions,
		 but these interfaces are not designed to be portable
		 between platforms: A call to a C library function that
		 works on one platform may fail on the next platform,
		 because the parameter and return value types of the C
		 function may be different. In this paper, we present an
		 interface that avoids this problem: In particular, the
		 actual calls can be made platform-independent; a part
		 of the declarations is platform-dependent, but can be
		 generated automatically from C .h-files.",
}
@Article{ertl06vd4,
  author =	"M. Anton Ertl",
  title =	"Einladung zur Forth-Tagung 2007",
  journal =	"Vierte Dimension",
  year = 	"2006",
  volume =	"22",
  number =	"4",
  pages =	"36",
  month =	dec,
  OPTnote =	"not refereed",
}
@InProceedings{ertl&gregg05,
  author =	"M. Anton Ertl and David Gregg",
  title =	"Stack Caching in {Forth}",
  booktitle =	"21st EuroForth Conference",
  year = 	"2005",
  editor =	"M. Anton Ertl",
  pages =	"6--15",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl%26gregg05.ps.gz",
  pdfurl =	"http://www.complang.tuwien.ac.at/anton/euroforth2005/papers/ertl%26gregg05.pdf",
  OPTnote =	"not refereed",
  abstract =	"Stack caching speeds Forth up by keeping stack items
		 in registers, reducing the number of memory accesses
		 for stack items. This paper describes our work on
		 extending Gforth's stack caching implementation to
		 support more than one register in the canonical state,
		 and presents timing results for the resulting Forth
		 system. For single-representation stack caches, keeping
		 just one stack item in registers is usually best, and
		 provides speedups up to a factor of 2.84 over the
		 straight-forward stack representation. For stack caches
		 with multiple stack representations, using the
		 one-register representation as canonical representation
		 is usually optimal, resulting in an overall speedup of
		 up to a factor of 3.80 (and up to a factor of 1.53 over
		 single-representation stack caching).",
}
@InProceedings{ertl&paysan05,
  author =	"M. Anton Ertl and Bernd Paysan",
  title =	"Xchars or {Unicode} in {Forth}",
  booktitle =	"21st EuroForth Conference",
  year = 	"2005",
  editor =	"M. Anton Ertl",
  pages =	"16--20",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl%26paysan05.ps.gz",
  pdfurl =	"http://www.complang.tuwien.ac.at/anton/euroforth2005/papers/ertl%26paysan05.pdf",
  OPTnote =	"not refereed",
  abstract =	"When dealing with different scripts at the same time
		 (e.g., Latin, Greek, Cyrillic), or with Chinese
		 ideograms, 8-bit fixed-width characters are too narrow.
		 However, many Forth programs have an environmental
		 dependency on $\code{1 chars}=1$, so just making Forth
		 characters wider would cause quite a lot of portability
		 problems. We propose to add xchars for dealing with
		 potentially wider, variable-width characters. This
		 extension is relatively painless, requiring changes in
		 only those program parts that work with individual
		 characters, if they should work with the extended
		 characters; uses of string words need no changes to
		 work with extended characters. The xchar words can also
		 be implemented on 8-bit-only Forth systems, so programs
		 written to use xchars can also work on such systems.",
}
@InProceedings{shi+05,
  author =	"Yunhe Shi and David Gregg and Andrew Beatty and M.
		 Anton Ertl",
  title =	"Virtual Machine Showdown: Stack Versus Registers",
  booktitle =	"Virtual Execution Environments (VEE '05)",
  pages =	"153--163",
  year = 	"2005",
  URL =  	"http://www.cs.tcd.ie/David.Gregg/papers/vee05-ShiGreggBeattyErtl.pdf",
}
@Article{ertl05ivme-editorial,
  author =	"M. Anton Ertl",
  title =	"Advances in Interpreters, Virtual Machines, and
		 Emulators",
  journal =	"Science of Computer Programming",
  year = 	"2005",
  volume =	"57",
  number =	"3",
  pages =	"251--252",
  month =	sep,
  note = 	"Editorial for special issue with journal versions of
		 the IVME'03 papers.",
}
@InProceedings{ertl&gregg04ivme,
  author =	"M. Anton Ertl and David Gregg",
  title =	"Combining Stack Caching with Dynamic
		 Superinstructions",
  booktitle =	"Interpreters, Virtual Machines and Emulators (IVME
		 '04)",
  year = 	"2004",
  pages =	"7--14",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl%26gregg04ivme.ps.gz",
  abstract =	"Dynamic superinstructions eliminate most of the
		 interpreter dispatch overhead. This results in a higher
		 proportion of interpreter time spent in stack accesses
		 (on a stack-based virtual machine). Stack caching
		 reduces the stack access overhead. Each of these
		 optimizations provides more speedup, if the other one
		 is applied, too. Combining these optimizations also
		 opens additional opportunities: we can insert stack
		 state transitions without dispatch cost; this reduces
		 the number of necessary VM instruction instances
		 significantly. A shortest-path search can find the
		 optimal sequence of state transitions and VM
		 instructions. In this paper we describe an
		 implementation of static stack caching employing these
		 ideas. We also represent empirical results for our
		 implementation, resulting in a speedup of up to 58\%
		 over a version that keeps one value in registers all
		 the time.",
}
@InProceedings{ertl&gregg04pact,
  author =	"M. Anton Ertl and David Gregg",
  title =	"Retargeting {JIT} compilers by using {C}-compiler
		 generated executable code",
  booktitle =	"Parallel Architecture and Compilation Techniques
		 (PACT' 04)",
  year = 	"2004",
  pages =	"41--50",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl%26gregg04pact.ps.gz",
  abstract =	"JIT compilers produce fast code, whereas interpreters
		 are easy to port between architectures. We propose to
		 combine the advantages of these language implementation
		 techniques as follows: we generate native code by
		 concatenating and patching machine code fragments taken
		 from interpreter-derived code (generated by a C
		 compiler); we completely eliminate the interpreter
		 dispatch overhead and accesses to the interpreted code
		 by patching jump target addresses and other constants
		 into the fragments. In this paper we present the basic
		 idea, discuss some issues in more detail, and present
		 results from a proof-of-concept implementation,
		 providing speedups of up to 1.87 over the fastest
		 previous interpreter-based technique, and performance
		 comparable to simple native-code compilers. The effort
		 required for retargeting our implementation from the
		 386 to the PPC architecture was less than a
		 person-day.",
}
@InProceedings{ertl04euroforth,
  author =	"M. Anton Ertl",
  title =	"Forth Family Tree and Timeline",
  booktitle =	"EuroForth 2004 Conference Proceedings",
  pages =	"1--4",
  year = 	"2004",
  OPTnote =	"not refereed",
  URL =  	"http://www.complang.tuwien.ac.at/forth/family-tree/",
}
@InProceedings{gregg&ertl04euroforth,
  author =	"David Gregg and M. Anton Ertl",
  title =	"Inlining in {Gforth}: Early Experiences",
  booktitle =	"EuroForth 2004 Conference Proceedings",
  pages =	"33--40",
  year = 	"2004",
  URL =  	"http://www.complang.tuwien.ac.at/papers/gregg%26ertl04euroforth.ps.gz",
  OPTnote =	"not refereed",
  abstract =	"Many optimizations are easier or more effective for
		 straight-line code (basic blocks). Straight-line code
		 in Forth is limited mainly by calls and returns.
		 Inlining eliminates calls and returns, which in turn
		 makes the basic blocks longer, and increases the
		 effectiveness of other optimizations. In this paper we
		 present a first prototype implementation of ininlining
		 for Gforth.",
}
@InCollection{gregg&ertl04dspg,
  author =	"David Gregg and M. Anton Ertl",
  title =	"A Language and Tool for Generating Efficient Virtual
		 Machine Interpreters",
  booktitle =	"Domain-Specific Program Generation",
  pages =	"196--215",
  publisher =	"Springer",
  year = 	"2004",
  series =	"LNCS 3016",
}
@Article{ertl03vd,
  author =	"M. Anton Ertl",
  title =	"Threaded Code -- Varianten und Optimierungen
		 (Kurzfassung)",
  journal =	"Vierte Dimension -- Das FORTH-Magazin",
  year = 	"2003",
  volume =	"19",
  number =	"1",
  pages =	"12--15",
  annote =	"A shortened, German version of \cite{ertl02}",
}
@Unpublished{ertl03dagstuhl,
  author =	"M. Anton Ertl",
  title =	"Optimizing Interpreters",
  note = 	"Talk given at the Dagstuhl Seminar 3071: Emerging
		 Technologies: Can Optimization Technology meet their
		 Demands?",
  year = 	"2003",
  month =	feb,
}
@InProceedings{ertl&gregg03,
  author =	"M. Anton Ertl and David Gregg",
  title =	"Optimizing Indirect Branch Prediction Accuracy in
		 Virtual Machine Interpreters",
  booktitle =	"SIGPLAN Conference on Programming Language Design and
		 Implementation (PLDI'03)",
  year = 	"2003",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl%26gregg03.ps.gz",
  abstract =	"Interpreters designed for efficiency execute a huge
		 number of indirect branches and can spend more than
		 half of the execution time in indirect branch
		 mispredictions. Branch target buffers are the best
		 widely available\mn{on all recent general-purpose
		 machines?} form of indirect branch prediction; however,
		 their prediction accuracy for existing interpretes is
		 only 2\%--50\%. In this paper we investigate two
		 methods for improving the prediction accuracy of BTBs
		 for interpreters: replicating virtual machine (VM)
		 instructions and combining sequences of VM instructions
		 into superinstructions. We investigate static
		 (interpreter build-time) and dynamic (interpreter
		 run-time) variants of these techniques and compare them
		 and several combinations of these techniques. These
		 techniques can eliminate nearly all of the dispatch
		 branch mispredictions, and have other benefits,
		 resulting in speedups by a factor of up to 3.17 over
		 efficient threaded-code interpreters, and speedups by a
		 factor of up to 1.3 over techniques relying on
		 superinstructions alone.",
}
@Proceedings{ivme03,
  title =	"Interpreters, Virtual Machines and Emulators
		 (IVME~'03)",
  booktitle =	"Interpreters, Virtual Machines and Emulators
		 (IVME~'03)",
  year = 	"2003",
  key =  	"IVME~'03",
  editor =	"M. Anton Ertl",
  URL =  	"http://www.complang.tuwien.ac.at/anton/ivme03/proceedings/ivme.ps.gz",
  url2 = 	"http://portal.acm.org/toc.cfm?id=858570&type=proceeding",
}
@InProceedings{ertl&gregg03euroforth,
  author =	"M. Anton Ertl and David Gregg",
  title =	"Implementation Issues for Superinstructions in
		 {Gforth}",
  booktitle =	"EuroForth 2003 Conference Proceedings",
  year = 	"2003",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl%26gregg03euroforth.ps.gz",
  abstract =	"Combining Forth primitives into superinstructions
		 provides nice speedups. Several approaches to
		 superinstructions were explored in the Gforth project.
		 This paper discusses the effects of these approaches on
		 performance, compilation time, implementation effort,
		 and on programming tools such as the decompiler and
		 backtracing.",
}
@Article{ertl&gregg03jilp,
  author =	"M. Anton Ertl and David Gregg",
  title =	"The Structure and Performance of \emph{Efficient}
		 Interpreters",
  journal =	"The Journal of Instruction-Level Parallelism",
  year = 	"2003",
  volume =	"5",
  month =	nov,
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl%26gregg03jilp.ps.gz",
  url2 = 	"http://www.jilp.org/vol5/v5paper12.pdf",
  note = 	"http://www.jilp.org/vol5/",
  abstract =	"Interpreters designed for high general-purpose
		 performance typically perform a large number of
		 indirect branches (3.2\%--13\% of all executed
		 instructions in our benchmarks). These branches consume
		 more than half of the run-time in a number of
		 configurations we simulated. We evaluate how accurate
		 various existing and proposed branch prediction schemes
		 are on a number of interpreters, how the mispredictions
		 affect the performance of the interpreters and how two
		 different interpreter implementation techniques perform
		 with various branch predictors. We also suggest various
		 ways in which hardware designers, C compiler writers,
		 and interpreter writers can improve the performance of
		 interpreters.",
}
@InProceedings{casey+03,
  author =	"Kevin Casey and David Gregg and M. Anton Ertl and
		 Andrew Nisbeth",
  title =	"Towards Superinstructions for Java Interpreters",
  booktitle =	"Software and Compilers for Embedded Systems (SCOPES
		 2003)",
  pages =	"329--343",
  year = 	"2003",
  volume =	"2826",
  series =	"LNCS",
  publisher =	"Springer",
}
@Manual{gforth03,
  title =	"Gforth (Manual)",
  author =	"Neal Crook and Anton Ertl and David Kuehling and Bernd
		 Paysan and Jens Wilke",
  organization = "Free Software Foundation",
  edition =	"0.6.2",
  year = 	"2003",
}
@Article{ertl+02,
  author =	"M. Anton Ertl and David Gregg and Andreas Krall and
		 Bernd Paysan",
  title =	"\textsf{vmgen} --- A Generator of Efficient Virtual
		 Machine Interpreters",
  journal =	"Software---Practice and Experience",
  year = 	"2002",
  volume =	"32",
  number =	"3",
  pages =	"265--294",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl+02.ps.gz",
  abstract-url = "http://www3.interscience.wiley.com/cgi-bin/abstract/90010508/START",
  keywords =	"interpreter; virtual machine; generator; stack
		 architecture; superinstruction; byte code",
  abstract =	"In a virtual machine interpreter, the code for each
		 virtual machine instruction has similarities to code
		 for other instructions. We present an interpreter
		 generator that takes simple virtual machine instruction
		 descriptions as input and generates C code for
		 processing the instructions in several ways: execution,
		 virtual machine code generation, disassembly, tracing,
		 and profiling. The generator is designed to support
		 efficient interpreters: it supports threaded code,
		 caching the top-of-stack item in a register, combining
		 simple instructions into superinstructions, and other
		 optimizations. We have used the generator to create
		 interpreters for Forth and Java. The resulting
		 interpreters are faster than other interpreters for the
		 same languages and they are typically 2-10 times slower
		 than code produced by native-code compilers. We also
		 present results for the effects of the individual
		 optimizations supported by the generator.",
}
@InProceedings{ertl&gregg02,
  author =	"M. Anton Ertl and David Gregg",
  title =	"Building an interpreter with \textsf{vmgen}",
  booktitle =	"Compiler Construction (CC'02)",
  pages =	"5--8",
  year = 	"2002",
  publisher =	"Springer LNCS~2304",
  note = 	"Tool Demonstration",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl%26gregg02.ps.gz",
  abstract =	"\textsf{Vmgen} automates many of the tasks of writing
		 the virtual machine part of an interpreter, resulting
		 in less coding, debugging and maintenance effort. This
		 paper gives some quantitative data about the source
		 code and generated code for a \textsf{vmgen}-based
		 interpreter, and gives some examples demonstrating the
		 simplicity of using \textsf{vmgen}.",
}
@InProceedings{ertl02,
  author =	"M. Anton Ertl",
  title =	"Threaded Code Variations and Optimizations (Extended
		 Version)",
  booktitle =	"Forth-Tagung 2002",
  year = 	"2002",
  address =	"Garmisch-Partenkirchen",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl02.ps.gz",
  abstract =	"Forth has been traditionally implemented as indirect
		 threaded code, where the code for non-primitives is the
		 code-field address of the word. To get the maximum
		 benefit from combining sequences of primitives into
		 superinstructions, the code produced for a
		 non-primitive should be a primitive followed by a
		 parameter (e.g., \code{lit} \emph{addr} for variables).
		 This paper takes a look at the steps from a traditional
		 threaded-code implementation to superinstructions, and
		 at the size and speed effects of the various
		 steps.\comment{It also compares these variants of
		 Gforth to various other Forth implementations on
		 contemporary machines.} The use of superinstructions
		 gives speedups of up to a factor of 2 on large
		 benchmarks on processors with branch target buffers,
		 but requires more space for the primitives and the
		 optimization tables, and also a little more space for
		 the threaded code.",
}
@InProceedings{ertl02ef,
  author =	"M. Anton Ertl",
  title =	"The Evolution of Vmgen",
  booktitle =	"18th EuroForth Conference",
  year = 	"2002",
  editor =	"M. Anton Ertl",
  pages =	"33--37",
  URL =  	"http://www.complang.tuwien.ac.at/anton/euroforth2002/papers/ertl.ps.gz",
  note = 	"Slides",
}
@InProceedings{ertl02efb,
  author =	"M. Anton Ertl",
  title =	"Superinstructions in {Gforth}",
  booktitle =	"18th EuroForth Conference",
  year = 	"2002",
  editor =	"M. Anton Ertl",
  note = 	"Demonstration only, no paper",
}
@Proceedings{euroforth02,
  title =	"18th EuroForth Conference",
  booktitle =	"18th EuroForth Conference",
  year = 	"2002",
  key =  	"EuroForth'02",
  editor =	"M. Anton Ertl",
}
@InProceedings{gregg+01,
  author =	"David Gregg and M.~Anton Ertl and Andreas Krall",
  title =	"A Fast {Java} Interpreter",
  booktitle =	"JOSES Workshop at ETAPS'01",
  year = 	"2001",
  editor =	"Uwe Assmann",
  URL =  	"http://www.complang.tuwien.ac.at/papers/gregg+01.ps.gz",
  abstract =	"The Java virtual machine (JVM) is usually implemented
		 with an interpreter or just-in-time (JIT) compiler.
		 JITs provide the best performance, but must be
		 substantially rewritten for each architecture they are
		 ported to. Interpreters are easier to develop and
		 maintain, need less memory and can be ported to new
		 architectures with almost no changes. The weakness of
		 interpreters is that they are much slower than JITs.
		 This paper describes work in progress on faster Java
		 interpreters. Our goal is to bring interpreter
		 performance to a new higher level by developing new
		 optimisations for modern computer architectures, and by
		 adapting recent research on compilers to
		 interpreters.",
}
@InProceedings{ertl&gregg01,
  author =	"M.~Anton Ertl and David Gregg",
  title =	"The Behaviour of Efficient Virtual Machine
		 Interpreters on Modern Architectures",
  booktitle =	"Euro-Par 2001",
  pages =	"403--412",
  year = 	"2001",
  publisher =	"Springer LNCS~2150",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl%26gregg01.ps.gz",
  abstract =	"Romer et al (ASPLOS 96) examined several interpreters
		 and concluded that they behave much like general
		 purpose integer programs such as gcc. We show that
		 there is an important class of interpreters which
		 behave very differently. Efficient virtual machine
		 interpreters perform a large number of indirect
		 branches (3.2\%--13\% of all executed instructions in
		 our benchmarks, taking up to 61\%-79\% of the cycles on
		 a machine with no branch prediction). We evaluate how
		 various branch prediction schemes and methods to reduce
		 the mispredict penalty affect the performance of
		 several virtual machine interpreters. Our results show
		 that for current branch predictors, threaded code
		 interpreters cause fewer mispredictions, and are almost
		 twice as fast as switch based interpreters on modern
		 superscalar architectures.",
}
@InProceedings{gregg+01hpcn,
  author =	"David Gregg and M.~Anton Ertl and Andreas Krall",
  title =	"Implementing an Efficient {Java} Interpreter",
  booktitle =	"High-Performance Computing and Networking (HPCN Europe
		 2001)",
  pages =	"613--620",
  year = 	"2001",
  publisher =	"Springer LNCS~2110",
  URL =  	"http://www.complang.tuwien.ac.at/papers/gregg+01hpcn.ps.gz",
  pdf-url =	"http://link.springer-ny.com/link/service/series/0558/papers/2110/21100613.pdf",
  abstract =	"The Java virtual machine (JVM) is usually implemented
		 with an interpreter or just-in-time (JIT) compiler. JIT
		 compilers provide the best performance, but must be
		 substantially rewritten for each architecture they are
		 ported to. Interpreters are easier to develop and
		 maintain, and can be ported to new architectures with
		 almost no changes. The weakness of interpreters is that
		 they are much slower than JIT compilers. This paper
		 describes work in progress on a highly efficient Java
		 interpreter. We describe the main features that make
		 our interpreter efficient. Our initial experimental
		 results show that an interpreter-based JVM may be only
		 1.9 times slower than a compiler-based JVM for some
		 important applications.",
}
@InProceedings{ertl01,
  author =	"M. Anton Ertl",
  title =	"Threaded Code Variations and Optimizations",
  booktitle =	"EuroForth 2001 Conference Proceedings",
  pages =	"49--55",
  year = 	"2001",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl01.ps.gz",
  abstract =	"Forth has been traditionally implemented as indirect
		 threaded code, where the code for non-primitives is the
		 code-field address of the word. To get the maximum
		 benefit from combining sequences of primitives into
		 superinstructions, the code produced for a
		 non-primitive should be a primitive followed by a
		 parameter (e.g., \code{lit} \emph{addr} for variables).
		 This paper takes a look at the steps from a traditional
		 threaded-code implementation to superinstructions, and
		 at the size and speed effects of the various steps. The
		 use of superinstructions gives speedups of up to a
		 factor of 2 on large benchmarks on processors with
		 branch target buffers, but requires more space for the
		 primitives and the optimization tables, and also a
		 little more space for the threaded code.",
}
@InProceedings{gregg+01euroforth,
  author =	"David Gregg and M. Anton Ertl and John Waldron",
  title =	"The Common Case in {Forth} Programs",
  booktitle =	"EuroForth 2001 Conference Proceedings",
  pages =	"63--70",
  year = 	"2001",
  url1 = 	"http://www.cs.tcd.ie/David.Gregg/euroforth01.ps.gz",
  URL =  	"http://www.complang.tuwien.ac.at/papers/gregg+01euroforth.ps.gz",
  abstract =	"Identifying common features in Forth programs is
		 important for those designing Forth machines and
		 optimisers. In this paper we measure the behaviour of
		 six large Forth programs and four small ones. We look
		 at the ratio of user to system code, basic block
		 lengths, common instructions, and common sequences of
		 instructions. Our most important finding is that for
		 most large programs, many (38.4\%--47.6\% statically
		 and 21.8\%--40.9\% dynamically) basic blocks consist of
		 only a single instruction, which hinders optimisation.
		 We also show static measures of frequent instructions
		 and sequences of instructions are more consistent
		 across programs, and may be a better predictor of the
		 behaviour of other programs than dynamic measures.",
}
@Unpublished{ertl01vhs,
  author =	"M. Anton Ertl",
  title =	"Wie werden Computer schneller?",
  note = 	"Vortrag im Rahmen der VHS-Veranstaltungsreihe
		 ``University meets Public''",
  month =	feb,
  year = 	"2001",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl01vhs.ps.gz",
  abstract =	"Computerhardware wird regelmaessig schneller und
		 leistungsfaehiger. Dieser Vortrag beschreibt diese
		 Entwicklungen, insbesondere in Bezug auf den internen
		 Aufbau der Prozessoren, und ihre Auswirkungen auf die
		 Software.",
}
@Unpublished{ertl01state,
  author =	"M. Anton Ertl",
  title =	"STATE-smartness: Applications, Pitfalls,
		 Alternatives",
  note = 	"This paper is an updated and significantly revised
		 version of my EuroForth '98 paper \cite{ertl98}; I
		 submitted this draft version to JFAR in 2001, but it
		 was not processed, so I finally put it online.",
  year = 	"2001",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl01state.pdf",
  abstract =	"STATE-smart words provide a number of unpleasant
		 surprises to their users. They are applied in two
		 contexts, and they fail in both: 1) for implementing
		 words like \texttt{s"} that provide an arbitrary
		 combination of interpretation and compilation semantics
		 (combined words); 2) for optimizing using a special
		 implementation of the (default) compilation semantics.
		 This paper discusses these issues and shows programmers
		 and system implementors how to avoid STATE-smart words.
		 It also reports our experiences in converting the
		 STATE-smart words in Gforth into a clean solution:
		 little work and few problems.",
}
@Unpublished{ertl00dagstuhl,
  author =	"M. Anton Ertl",
  title =	"Optimization During Tree-Parsing Code Selection",
  note = 	"Talk given at the Dagstuhl Seminar on Code
		 Optimization",
  year = 	"2000",
  month =	sep,
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl00dagstuhl.ps.gz",
  abstract =	"Tree parsing is well-known as a method for code
		 selection. This talk presents a technique for also
		 using it for some optimizations. The basic principle is
		 to introduce additional nonterminals that correspond to
		 additional data representations. For example, a
		 nonterminal representing complemented values can be
		 used to distribute complement operations to the optimal
		 position (for a machine) using DeMorgan's laws. Other
		 examples are other unary operators, constant folding
		 across intervening non-constants, and optimizing the
		 conversion between different representations, such as
		 various flag representations, tagged and untagged
		 representations, or various data sizes. The advantages
		 of this technique are that it optimizes for the
		 machine, not some intermediate code metric and that it
		 has no compile-time overhead when used with tree
		 parsing automata. The limitations are that there are
		 only a finite number of nonterminals and thus data
		 representations, that optimality is limited to trees,
		 that otherwise it is limited to single-entry regions,
		 and that it results in large grammars. This talk also
		 mentions further work in factoring the grammar and in
		 dealing with DAGs.",
}
@InProceedings{ertl00,
  author =	"M. Anton Ertl",
  title =	"\texttt{CONST-DOES>}",
  booktitle =	"EuroForth 2000 Conference Proceedings",
  year = 	"2000",
  address =	"Prestbury, UK",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl00.ps.gz",
  abstract =	"A frequent use of the \code{create}...\code{does>}
		 construct is to provide some constants at definition
		 time that are then used at execution time (a
		 \code{constant}-style use). However,
		 \code{create}...\code{does>} also supports a
		 \code{value}-style use, where the data is not constant
		 and can change at any time. This additional
		 functionality inhibits optimization. This paper
		 proposes \code{const-does>}, which can only be used to
		 define \code{constant}-style words, and thus makes
		 optimization possible.",
}
@InProceedings{czezatke&ertl00,
  author =	"Christian Czezatke and M. Anton Ertl",
  title =	"{LinLogFS} --- A Log-Structured Filesystem For
		 {Linux}",
  booktitle =	"Freenix Track of Usenix Annual Technical Conference",
  pages =	"77--88",
  year = 	"2000",
  URL =  	"http://www.complang.tuwien.ac.at/papers/czezatke%26ertl00/",
  ps-url =	"http://www.complang.tuwien.ac.at/papers/czezatke%26ertl00.ps.gz",
  abstract =	"LinLogFS is a log-structured filesystem for Linux. It
		 currently offers features like fast crash recovery and
		 in-order write semantics. We implemented LinLogFS by
		 putting a logging layer between an adapted version of
		 the ext2 file system and the block device.",
}
@InProceedings{ertl99,
  author =	"M. Anton Ertl",
  title =	"Optimal Code Selection in {DAG}s",
  booktitle =	"Principles of Programming Languages (POPL '99)",
  year = 	"1999",
  OPTpages =	"242--249",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl99.ps.gz",
  abstract =	"We extend the tree parsing approach to code selection
		 to DAGs. In general, our extension does not produce the
		 optimal code selection for all DAGs (this problem would
		 be NP-complete), but for certain code selection
		 grammars, it does. We present a method for checking
		 whether a code selection grammar belongs to this set of
		 DAG-optimal grammars, and use this method to check code
		 selection grammars adapted from lcc: the grammars for
		 the MIPS and SPARC architectures are DAG-optimal, and
		 the code selection grammar for the 386 architecture is
		 almost DAG-optimal.",
}
@Unpublished{ertl+99,
  author =	"M. Anton Ertl and David Gregg and Andreas Krall",
  title =	"Optimal Global Instruction Scheduling with Unlimited
		 Resources",
  note = 	"http://www.complang.tuwien.ac.at/papers/ertl+99.ps.gz",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl+99.ps.gz",
  year = 	"1999",
  abstract =	"We present a method for optimal whole-procedure
		 instruction scheduling for machines with unlimited
		 resources: The program and its dependences are
		 transformed into a linear programming problem, which
		 can then be solved using an off-the-shelf linear
		 problem solver. This scheduler is an intermediate step
		 towards a more realistic global instruction scheduler,
		 but it has also an immediate use: We use it to evaluate
		 the significance of the restrictions imposed by static
		 scheduling and for determining an upper bound for the
		 performance of more realistic global instruction
		 schedulers. We have applied the scheduler to several
		 benchmarks and compared it to a dynamic scheduler with
		 unlimited resources. For some benchmarks, they perform
		 equally well; for others, dynamic scheduling performs
		 much better; on closer inspection it appears that the
		 causes for this performance difference can be reduced
		 by performing well-known transformations before
		 scheduling (in particular, loop transformations).",
}
@InProceedings{ertl99ef,
  author =	"M. Anton Ertl",
  title =	"Is {Forth} Code Compact? {A} Case Study",
  booktitle =	"EuroForth'99 Conference Proceedings",
  year = 	"1999",
  address =	"St. Petersburg, Russia",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl99ef.ps.gz",
  abstract =	"Forth advocates often claim that Forth code is
		 smaller, faster, and requires less development time
		 than equivalent programs in other languages. This paper
		 investigates this claim by comparing a number of parser
		 generators written in various languages with respect to
		 source code size. The smallest parser generator (14
		 lines) in this comparison is written in Forth, and the
		 other Forth program is smaller than the others in its
		 class by a factor of 8 or more; however, the Forth
		 programs do not have all the features of their
		 counterparts. I took a closer look at Gray (in Forth)
		 and Coco/R (in Modula-2) and found that several Forth
		 features missing from Modula-2 give Gray more than a
		 factor of three advantage over Coco/R (even if the
		 other size differences were solely due to differences
		 in functionality): run-time code generation; access to
		 the parser and a simple, flexible syntax; and Forth's
		 dictionary.",
}
@Unpublished{ertl99boulder,
  author =	"M. Anton Ertl",
  title =	"A fast compiler for a stack-based language",
  note = 	"Talk given at the University of Colorado in Boulder",
  year = 	"1999",
  month =	apr,
  abstract =	"I will present RAFTS, a Forth compiler, focussing on
		 aspects that are also useful in compilers for other
		 langauges.\parSpecific needs addressed by RAFTS are:
		 The source language relies on programmer-visible stacks
		 (like JavaVM and Postscript), it requires very fast
		 compilation (like e.g., Lisp, Prolog, Smalltalk, and
		 JavaVM) to remain interactive, and the source code
		 usually has a very high procedure call frequency (like,
		 e.g., Lisp, Prolog, ML).\par My presentation will give
		 a tour of the compiler, pointing out some of the
		 techniques that address these needs, e.g.: How convert
		 the stack-based code into data flow graphs, enabling
		 the application of standard code generation techniques;
		 how backwards scheduling simplifies the data structure
		 and saves two passes; how we minimized the number of
		 passes without sacrificing code quality; how a
		 threaded-code intermediate representation speeds up
		 code replication optimizations.",
}
@TechReport{ertl&pirker98,
  author =	"M. Anton Ertl and Christian Pirker",
  title =	"Compilation of Stack-Based Languages
		 ({Abschlu{\ss}bericht})",
  institution =  "Institut f{\"u}r Computersprachen, Technische
		 Universit{\"a}t Wien",
  year = 	"1998",
  type = 	"Final report to {FWF} for research project {P11231}",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl%26pirker98.ps.gz",
  abstract =	"RAFTS is a framework for applying state of the art
		 compiler technology to the compilation of stack-based
		 languages like Forth and Postscript. The special needs
		 of stack-based languages are an efficient stack
		 representation, fast procedure calls, and fast
		 compilation. RAFTS addresses the stack representation
		 problem by allocating stack items to registers such
		 that most stack accesses in the source program are
		 register accesses in the machine language program, and
		 by eliminating most stack pointer updates. To achieve
		 fast calls, RAFTS performs these optimizations
		 interprocedurally and also performs procedure inlining
		 and tail call optimization. Fast compilation is
		 achieved by selecting fast algorithms and implementing
		 them efficiently. Until now we have implemented the
		 basic block part of RAFTS and a part of the work
		 necessary for inlining and interprocedural
		 optimizations.",
}
@InProceedings{maierhofer&ertl98,
  author =	"Martin Maierhofer and M. Anton Ertl",
  title =	"Local Stack Allocation",
  booktitle =	"Compiler Construction (CC'98)",
  year = 	"1998",
  editor =	"Kai Koskimies",
  publisher =	"Springer LNCS~1383",
  address =	"Lisbon",
  pages =	"189--203",
  URL =  	"http://www.complang.tuwien.ac.at/papers/maierhofer%26ertl98.ps.gz",
  abstract =	"Considering the renewed interest in stack machines (in
		 particular, the Java Virtual Machine), efficient
		 execution of Algol-family languages on this class of
		 hardware becomes increasingly important. Local variable
		 accesses in the source language should be translated
		 into stack accesses on the target machine (in analogy
		 to register allocation on register machines).\par In
		 this paper we explore such stack allocation techniques
		 for basic blocks. We present some improvements to Phil
		 Koopman's \emph{stack scheduling}, add an instruction
		 scheduler and compare the result with an optimal stack
		 allocation and instruction scheduling strategy. Stack
		 scheduling in cooperation with depth first postorder
		 instruction scheduling produces results close to the
		 optimum. The optimizations discussed in this paper are
		 profitable only for stack hardware where stack
		 manipulation operations are faster than local variable
		 accesses.",
}
@InProceedings{krall+98,
  author =	"Andreas Krall and Anton Ertl and Michael Gschwind",
  title =	"{JavaVM} Implementation: Compilers versus Hardware",
  booktitle =	"Computer Architecture 98 (ACAC '98)",
  year = 	"1998",
  editor =	"John Morris",
  volume =	"20",
  series =	"Australian Computer Science Communications",
  publisher =	"Springer",
  address =	"Perth",
  pages =	"101--110",
  URL =  	"http://www.complang.tuwien.ac.at/papers/krall+98.ps.gz",
  abstract =	"The Java Virtual Machine (JavaVM) has contributed
		 greatly to Java's success because it provides a common
		 intermediate format which can be shared across the
		 Internet. Unfortunately, the JavaVM has been optimized
		 for an interpreted model, resulting in inferior
		 performance because its stack-based execution model
		 cannot exploit instruction-level parallelism. The
		 inherent serialization of the stack execution model can
		 be addressed either by using compilation techniques or
		 by hardware. In this article, we review the different
		 JavaVM implementation methods based on our experiences
		 with the implementation of the CACAO just-in-time
		 compiler. For comparison, we have also investigated
		 different hardware architectures for the direct
		 implementation of the JavaVM.",
}
@InProceedings{ertl98,
  author =	"M. Anton Ertl",
  title =	"\texttt{State}-smartness --- Why it is Evil and How to
		 Exorcise it",
  booktitle =	"EuroForth'98 Conference Proceedings",
  year = 	"1998",
  address =	"Schlo\ss{} Dagstuhl",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl98.ps.gz",
  abstract =	"\texttt{State}-smart words provide a number of
		 unpleasant surprises to their users. They are applied
		 in two contexts, and they fail in both: 1) for
		 providing an arbitrary combination of interpretation
		 and compilation semantics; 2) for optimizing with a
		 special implementation of the (default) compilation
		 semantics. This paper discusses these issues and shows
		 programmers and system implementors how to avoid
		 \texttt{state}-smart words. It also reports our
		 experiences in converting the \texttt{state}-smart
		 words in Gforth into a clean solution: little work and
		 few problems.",
}
@InProceedings{ertl97,
  author =	"M. Anton Ertl",
  title =	"{GRAY -- ein Generator f{\"u}r rekursiv absteigende
		 Ybersetzer}",
  booktitle =	"Forth-Tagung",
  year = 	"1997",
  address =	"Ludwigshafen",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl97.ps.gz",
  note = 	"In German",
  abstract =	"Der Parser-Generator Gray {\"u}bersetzt Grammatiken in
		 ausf{\"u}hrbaren Forth-Code f{\"u}r einen Parser. Als
		 Besonderheit ist dabei die problemlose Kombination
		 semantischen Aktionen und Erweiterungen der BNF zu
		 nennen, die sich aus der Verwendung des Stacks zur
		 Kommunikation zwischen den semantischen Aktionen
		 ergibt. Dieser Artikel beschreibt den Entwurf und die
		 Verwendung von Gray.",
}
@InProceedings{maierhofer&ertl97,
  author =	"Martin Maierhofer and M. Anton Ertl",
  title =	"Optimizing Stack Code",
  booktitle =	"Forth-Tagung",
  year = 	"1997",
  address =	"Ludwigshafen",
  URL =  	"http://www.complang.tuwien.ac.at/papers/maierhofer%26ertl97.ps.gz",
  abstract =	"With interest in stack machines recently growing (e.g.
		 the JavaVM architecture used by the Java programming
		 language), support for the efficient execution of
		 Algol-like high level languages on this class of
		 hardware becomes an issue. Local variables accesses in
		 the source language should be translated into stack
		 accesses of the target machine (similar to register
		 allocation on register machines).\par In this paper we
		 explore such stack allocation techniques for basic
		 blocks. We evaluate Phil Koopman's ``stack scheduling''
		 by adding an instruction scheduler and comparing the
		 result with an optimal stack allocation and instruction
		 scheduling strategy. Stack scheduling in cooperation
		 with a depth first postorder instruction scheduling
		 produces results close to the optimum. The
		 optimizations discussed in this paper are only useful
		 for stack hardware that allows faster access to the
		 stack than to main memory.",
}
@InProceedings{ertl&pirker97,
  author =	"M. Anton Ertl and Christian Pirker",
  title =	"The Structure of a {Forth} Native Code Compiler",
  booktitle =	"EuroForth '97 Conference Proceedings",
  year = 	"1997",
  address =	"Oxford",
  pages =	"107--116",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl%26pirker97.ps.gz",
  abstract =	"Writing a sophisticated Forth native code compiler
		 poses some tasks that are not discussed in compiler
		 text books. Some of these tasks arise from specific
		 language features, others from the requirement for very
		 fast compilation to maintain interactivity. In this
		 paper we describe some of the more interesting data
		 structures and algorithms used in the RAFTS
		 prototype.",
}
@Unpublished{ertl97ibm,
  author =	"M. Anton Ertl",
  title =	"Fast high-quality {JavaVM} compilation",
  note = 	"Seminar given at IBM Watson Research Center, Yorktown
		 Heights",
  year = 	"1997",
  month =	aug,
  abstract =	"I will discuss how to apply the technology we
		 developed for our Forth native code compilation project
		 RAFTS to JavaVM just-in-time (JIT) compilation, and
		 describe parts of the CACAO JavaVM just-in-time
		 compiler. Overall, this gives an outline how CACAO will
		 probably look in the future.\par For generating good
		 code for register machines from stack-based languages,
		 we have to avoid the overheads of the straightforward
		 memory-based stack representation; instead, we keep the
		 stack items in registers, and eliminate as many
		 stack-pointer updates as possible by combining
		 them.\par The basis of our native code generation
		 technique is a method for converting the stack-based
		 code for a basic block into a data dependence graph;
		 this allows us to apply standard code generation
		 techniques: we select code with tree parsing, schedule
		 the instructions with list scheduling, and use a simple
		 basic block register allocator; these phases are merged
		 into as few passes as possible to achieve very fast
		 compilation. The global (intraprocedural) register
		 allocation for the JavaVM is performed by another
		 simple register allocator that assigns a register to
		 each stack slot that is alive at a basic block
		 boundary, and one register to each local variable
		 slot.",
}
@Article{ertl97objects,
  author =	"M. Anton Ertl",
  title =	"Yet Another {Forth} Objects Package",
  journal =	"Forth Dimensions",
  year = 	"1997",
  volume =	"19",
  number =	"2",
  pages =	"37--43",
  URL =  	"http://www.complang.tuwien.ac.at/forth/objects/objects.html",
}
@Article{ertl97structs,
  author =	"M. Anton Ertl",
  title =	"Yet Another {Forth} Structures Package",
  journal =	"Forth Dimensions",
  year = 	"1997",
  volume =	"19",
  number =	"3",
  pages =	"13--16",
  URL =  	"http://www.complang.tuwien.ac.at/forth/objects/structs.html",
}
@InProceedings{ertl&krall96,
  author =	"M. Anton Ertl and Andreas Krall",
  title =	"Removing Anti Dependences by Repairing",
  booktitle =	"Compiler Construction (CC'96)",
  year = 	"1996",
  editor =	"Tibor Gyim\'{o}thy",
  volume =	"1060",
  series =	"LNCS",
  publisher =	"Springer",
  address =	"Link{\"o}ping",
  pages =	"33--43",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl-krall96cc.ps.gz",
  abstract =	"Anti dependences (write-after-read dependences)
		 constrain the reordering of instructions and limit the
		 effectiveness of instruction scheduling and software
		 pipelining techniques for superscalar and VLIW
		 processors. Repairing solves this problem: If the
		 definition of a variable is moved before a previous use
		 of that variable, compiler-generated repair code
		 reconstructs the value that the definition destroyed.
		 Repairing features several potential advantages over
		 register renaming, another technique for removing anti
		 dependences: less register pressure, less loop
		 unrolling and fewer move instructions.",
}
@PhdThesis{ertl96diss,
  author =	"M. Anton Ertl",
  title =	"Implementation of Stack-Based Languages on Register
		 Machines",
  school =	"{Technische Universit{\"a}t Wien}",
  year = 	"1996",
  address =	"Austria",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl96diss.ps.gz",
  abstract =	"Languages with programmer-visible stacks (stack-based
		 languages) are used widely, as intermediate languages
		 (e.g., JavaVM, FCode), and as languages for human
		 programmers (e.g., Forth, PostScript). However, the
		 prevalent computer architecture is the register
		 machine. This poses the problem of efficiently
		 implementing stack-based languages on register
		 machines. A straight-forward implementation of the
		 stack consists of a memory area that contains the stack
		 items, and a pointer to the top-of-stack item.\par The
		 basic optimizations explored in this thesis are:
		 Caching the frequently-accessed top-of-stack items in
		 registers reduces stack access overhead, and combining
		 stack-pointer updates eliminates most of them.\par This
		 thesis examines these optimizations in the context of
		 three basic implementation techniques: \begin{itemize}
		 \item For (virtual machine) \textbf{interpreters}, I
		 regard the stack cache in the registers as finite state
		 machine, where the execution of a virtual machine
		 instruction performs a state transition; there are
		 specialized implementations of the virtual machine
		 instructions for each state. \item My
		 \textbf{native-code compilation} technique transforms
		 the programs into standard compiler data structures;
		 then state-of-the-art compiler technology can be
		 applied for optimization and code generation. In
		 particular, stack items are represented by
		 pseudo-registers, which register allocation will
		 (usually) put into machine registers; stack pointer
		 updates are executed symbolically, i.e., at compile
		 time. \item For \textbf{translation to C}, I emphasize
		 simplicity; the optimizer of the C compiler takes care
		 of the complex problems. The translator just has to
		 represent stack items as local C variables, and the C
		 compiler will keep them in registers. As with
		 native-code compilers, (usually) no stack pointer
		 updates at run-time are necessary. \end{itemize} For
		 interpreters, the optimizations eliminate about two
		 real machine instructions per virtual machine
		 instruction, resulting in speedups of 17\%--31\% (for
		 Forth on a DecStation~3100). The techniques presented
		 here for native-code compilation and translation to C
		 achieve a speedup factor of 1.3--3 over traditional
		 native code compilers and more than 3 over a
		 straight-forward translator to C (for Forth on a
		 486DX2/66).",
}
@InProceedings{ertl&pirker96,
  author =	"M. Anton Ertl and Christian Pirker",
  title =	"{RAFTS} for Basic Blocks: A progress report on {Forth}
		 Native Code Compilation",
  booktitle =	"EuroForth '96 Conference Proceedings",
  year = 	"1996",
  address =	"St. Petersburg, Russia",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl%26pirker96.ps.gz",
  abstract =	"RAFTS is a framework for applying state of the art
		 compiler technology to the compilation of stack-based
		 languages, in particular Forth. This paper describes
		 our experiences and findings in implementing the basic
		 block (straight-line code) part of RAFTS; it also
		 presents empirical results: the basic block part is the
		 simplest part of RAFTS, but without the other parts it
		 is hardly better than existing techniques, because in
		 Forth basic blocks are very short (there are few basic
		 blocks with more than two useful instructions).",
}
@InProceedings{ertl95pldi,
  author =	"M. Anton Ertl",
  title =	"Stack Caching for Interpreters",
  booktitle =	"SIGPLAN Conference on Programming Language Design and
		 Implementation (PLDI'95)",
  year = 	"1995",
  pages =	"315--327",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl95pldi.ps.gz",
  abstract =	"An interpreter can spend a significant part of its
		 execution time on arguments of virtual machine
		 instructions. This paper explores two methods to reduce
		 this overhead for virtual stack machines by caching
		 top-of-stack values in (real machine) registers. The
		 {\em dynamic method} is based on having, for every
		 possible state of the cache, one specialized version of
		 the whole interpreter; the execution of an instruction
		 usually changes the state of the cache and the next
		 instruction is executed in the version corresponding to
		 the new state. In the {\em static method} a state
		 machine that keeps track of the cache state is added to
		 the compiler. Common instructions exist in specialized
		 versions for several states, but it is not necessary to
		 have a version of every instruction for every cache
		 state. Stack manipulation instructions are optimized
		 away.",
}
@InProceedings{ErKr95,
  author =	"M. Anton Ertl and Andreas Krall",
  title =	"High Level Constraints over Finite Domains",
  booktitle =	"Constraint Processing",
  editor =	"Manfred Meyer",
  series =	"LNCS 923",
  publisher =	"Springer",
  pages =	"51--66",
  address =	"Heidelberg",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl%26krall93.ps.gz",
  year = 	"1995",
}
@InProceedings{ertl&maierhofer95,
  author =	"M. Anton Ertl and Martin Maierhofer",
  title =	"Translating {Forth} to Efficient {C}",
  booktitle =	"EuroForth~'95 Conference Proceedings",
  year = 	"1995",
  address =	"Schloss Dagstuhl, Germany",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl%26maierhofer95.ps.gz",
  abstract =	"An automatic translator can translate Forth into C
		 code which the current generation of optimizing C
		 compilers compiles to efficient machine code. I.e., the
		 resulting code keeps stack items in registers and
		 rarely updates the stack pointer. This paper presents a
		 simple translation method that produces efficient C
		 code, describes an implementation of the method and
		 presents results achieved with this implementation: The
		 translated code is 4.5--7.5 times faster than Gforth
		 (the fastest measured interpretive system), 1.3--3
		 times faster than BigForth 386 (a native code
		 compiler), and smaller than Gforth's threaded code.",
}
@Article{WaKr95a,
  author =	"Jian Wang and Andreas Krall and M. Anton Ertl",
  title =	"Trace Software Pipelining",
  journal =	"Journal of Computer Science and Technology",
  year = 	"1995",
  volume =	"10",
  number =	"6",
  pages =	"481--490",
}
@InProceedings{WaKr95b,
  author =	"Jian Wang and Andreas Krall and M. Anton Ertl",
  title =	"Decomposed Software Pipelining with reduced Register
		 Requirement",
  booktitle =	"International Conference on Parallel Architectures and
		 Compilation Techniques",
  editor =	"Jean-Luc Gaudiot",
  organization = "IFIP,ACM,IEEE",
  publisher =	"North-Holland",
  pages =	"277--280",
  address =	"Limassol",
  year = 	"1995",
}
@InProceedings{WaKr95c,
  author =	"Jian Wang and Andreas Krall and M. Anton Ertl",
  title =	"Register Requirement for Exploiting Loops' Maximum
		 Instruction-Level Parallelism",
  booktitle =	"The Fourth International Conference for Young Computer
		 Scientists",
  editor =	"Shou Bai and Jianping Fan and Xiaozhong Li",
  publisher =	"Peking University Press",
  pages =	"70--75",
  address =	"Beijing",
  year = 	"1995",
}
@InProceedings{ambrosch+94,
  author =	"Wolfgang Ambrosch and M. Anton Ertl and Felix Beer and
		 Andreas Krall",
  title =	"Dependence-Conscious Global Register Allocation",
  booktitle =	"Programming Languages and System Architectures",
  year = 	"1994",
  editor =	"J{\"u}rg Gutknecht",
  publisher =	"Springer LNCS~782",
  address =	"{Z{\"u}rich}",
  pages =	"125--136",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ambrosch+94.ps.gz",
  abstract =	"Register allocation and instruction scheduling are
		 antagonistic optimizations: Whichever is applied first,
		 it will impede the other. To solve this problem, we
		 propose dependence-conscious colouring, a register
		 allocation method that takes the dependence graph used
		 by the instruction scheduler into consideration.
		 Dependence-conscious colouring consists of two parts:
		 First, the interference graph is built by analysing the
		 dependence graphs, resulting in fewer interference
		 edges and less spilling than the conventional
		 preordering approach. Secondly, during colouring the
		 register selection keeps dependence paths short,
		 ensuring good scheduling. Dependence-conscious
		 colouring reduces the number of interference edges by
		 7\%--24\% and antidependences by 46\%--100\%.",
}
@InProceedings{ertl94l,
  author =	"M. Anton Ertl",
  title =	"Automatic Scoping of Local Variables",
  booktitle =	"EuroForth~'94 Conference Proceedings",
  year = 	"1994",
  address =	"Winchester, UK",
  pages =	"31--37",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl94l.ps.gz",
  abstract =	"In the process of lifting the restrictions on using
		 locals in Forth, an interesting problem poses itself:
		 What does it mean if a local is defined in a control
		 structure? Where is the local visible? Since the user
		 can create every possible control structure in ANS
		 Forth, the answer is not as simple as it may seem.
		 Ideally, the local is visible at a place if the control
		 flow {\em must} pass through the definition of the
		 local to reach this place. This paper discusses locals
		 in general, the visibility problem, its solution, the
		 consequences and the implementation as well as related
		 programming style questions.",
}
@InProceedings{ertl&krall94,
  author =	"M. Anton Ertl and Andreas Krall",
  title =	"Delayed Exceptions --- Speculative Execution of
		 Trapping Instructions",
  booktitle =	"Compiler Construction (CC '94)",
  year = 	"1994",
  publisher =	"Springer LNCS~786",
  address =	"Edinburgh",
  month =	apr,
  pages =	"158--171",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl-krall94cc.ps.gz",
  abstract =	"Superscalar processors, which execute basic blocks
		 sequentially, cannot use much instruction level
		 parallelism. Speculative execution has been proposed to
		 execute basic blocks in parallel. A pure software
		 approach suffers from low performance, because
		 exception-generating instructions cannot be executed
		 speculatively. We propose delayed exceptions, a
		 combination of hardware and compiler extensions that
		 can provide high performance and correct exception
		 handling in compiler-based speculative execution.
		 Delayed exceptions exploit the fact that exceptions are
		 rare. The compiler assumes the typical case (no
		 exceptions), schedules the code accordingly, and
		 inserts run-time checks and fix-up code that ensure
		 correct execution when exceptions do happen.",
}
@InProceedings{b94b,
  author =	"Jian Wang and Andreas Krall and M. Anton Ertl and
		 Christine Eisenbeis",
  title =	"Software pipelining with register allocation and
		 spilling",
  booktitle =	"proceedings of the 27th International Symposium and
		 Workshop on Microprogramming and Microarchitecture
		 (MICRO-27)",
  year = 	"1994",
  month =	dec,
  organization = "ACM and IEEE",
}
@InProceedings{b94a,
  author =	"Jian Wang and Andreas Krall and M. Anton Ertl and
		 Christine Eisenbeis",
  title =	"{Trace Software Pipelining}: A Novel Technique for
		 Parallelization of Loops with Branches",
  booktitle =	"proceedings of International Conference on Parallel
		 Architectures and Compilation Techniques (PACT'94)",
  year = 	"1994",
  month =	aug,
  organization = "IFIP",
  editor =	"Michel Cosnard and G. R. Gao and G. M. Silberman",
  publisher =	"Elsevier Science B. V. (North-Holland)",
}
@InProceedings{ertl93,
  author =	"M. Anton Ertl",
  title =	"A Portable {Forth} Engine",
  booktitle =	"EuroFORTH '93 conference proceedings",
  year = 	"1993",
  address =	"Mari\'ansk\'e L\'azn\`e (Marienbad)",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl93.ps.gz",
  abstract =	"The Forth engine discussed in this paper is written in
		 GNU C, which provides several extensions that are
		 important for Forth implementation. The indirect
		 threaded Forth engine is completely
		 machine-independent, direct threading requires a few
		 machine-specific lines for each machine. Using a
		 portable language like GNU C encourages producing an
		 engine with many primitives. In order to make the
		 development of primitives easier and less error-prone,
		 an automatic tool generates most of the code for a
		 Forth primitive from the stack effect notation, even if
		 the TOS is kept in a register. The engine is combined
		 with the parts of the system written in Forth by
		 loading a machine-independent image file that contains
		 the executable Forth code in relocatable form.",
}
@InProceedings{ertl&krall93deutsch,
  author =	"M. Anton Ertl and Andreas Krall",
  title =	"Benutzerdefinierte Constraints",
  booktitle =	"9.~Workshop Logische Programmierung",
  pages =	"18--22",
  year = 	"1993",
  address =	"Hagen",
  series =	"FernUniversit{\"a}t Hagen 146-10/1993",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl%26krall93deutsch.ps.gz",
  note = 	"In German",
  abstract =	"{Um kombinatorische Suchprobleme zu l{\"o}sen, werden
		 Konsistenztechniken in Logik-Programmiersprachen
		 verwendet. Bei vielen dieser Probleme reichen
		 allerdings die vordefinierten Constraints nicht aus.
		 Leider sind Constraints, die mit lookahead- oder
		 forward-Deklarationen definiert werden, oft sehr
		 langsam. In dieser Arbeit stellen wir einen neuen
		 Ansatz f{\"u}r benutzerdefinierte Constraints vor.
		 Benutzerdefinierte Constraints sind normale
		 Prolog-Pr{\"a}dikate mit einer zus{\"a}tzlichen
		 Constraint-Deklaration. Sie erm{\"o}glichen eine genaue
		 Kontrolle "uber Laufzeit und Suchraumbeschr{\"a}nkung.
		 Damit lassen sich gewaltige Laufzeitverbesserungen
		 gegen{\"u}ber lookahead-De\-kla\-ra\-tionen
		 erzielen.}",
}
@InProceedings{ertl&krall92,
  author =	"M. Anton Ertl and Andreas Krall",
  title =	"Instruction Scheduling for Complex Pipelines",
  booktitle =	"Compiler Construction (CC'92)",
  year = 	"1992",
  publisher =	"Springer LNCS~641",
  address =	"Paderborn",
  pages =	"207--218",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl%26krall92.ps.gz",
  abstract =	"We designed heuristics for applying the list
		 scheduling algorithm to processors with complex
		 pipelines. On these processors the pipeline can stall
		 due to resource contention (structural hazards) in
		 addition to the usual data hazards. Conventional
		 heuristics consider only data hazards. Our heuristics
		 reduce structural hazards, too. Code with much
		 instruction-level parallelism is optimized to avoid
		 structural hazards, sequential code is scheduled for
		 reducing data hazards. Embedded in a postpass strategy
		 our scheduler removes 60\%--100\% of the removable
		 stalls from conventionally scheduled code.",
}
@InProceedings{ertl92,
  author =	"M. Anton Ertl",
  title =	"A New Approach to {Forth} Native Code Generation",
  booktitle =	"EuroForth~'92",
  year = 	"1992",
  organization = "MicroProcessor Engineering",
  address =	"Southampton, England",
  pages =	"73--78",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl92.ps.gz",
  abstract =	"RAFTS is a framework for applying state of the art
		 compiler technology to the compilation of Forth. The
		 heart of RAFTS is a simple method for transforming
		 Forth programs into data flow graphs and static single
		 assignment form. Standard code generation and
		 optimization techniques can be applied to programs in
		 these forms. Specifically, RAFTS uses interprocedural
		 register allocation to eliminate nearly all stack
		 accesses. It also removes nearly all stack pointer
		 updates. Inlining and tail call optimization reduce the
		 call overhead. RAFTS compiles all of Forth, including
		 difficult cases like unknown stack heights, {\tt PICK},
		 {\tt ROLL} and {\tt EXECUTE}. And last, but not least,
		 RAFTS is designed for interactive Forth systems; it is
		 not restricted to batch compilers",
}
@InProceedings{ertl&krall91,
  author =	"M. Anton Ertl and Andreas Krall",
  title =	"Optimal Instruction Scheduling Using Constraint Logic
		 Programming",
  booktitle =	"Programming Language Implementation and Logic
		 Programming (PLILP)",
  year = 	"1991",
  publisher =	"Springer LNCS~528",
  address =	"Passau",
  pages =	"75--86",
  URL =  	"http://www.complang.tuwien.ac.at/papers/ertl%26krall91.ps.gz",
  abstract =	"Instruction scheduling is essential for the efficient
		 operation of today's and tomorrow's processors. It can
		 be stated easily and declaratively as a logic program.
		 Consistency techniques embedded in logic programming
		 makes the efficient solution of this problem possible.
		 This paper describes an instruction scheduling program
		 for the Motorola 88100 RISC processor, which minimizes
		 the number of pipeline stalls. The scheduler is written
		 in the constraint logic programming language ARISTO and
		 uses a declarative model of the processor to generate
		 an optimal schedule. The model uses lists of domain
		 variables to represent the pipeline stages and
		 describes the dependencies between instructions by
		 constraints in order to ensure correct scheduling.
		 Although optimal instruction scheduling is NP-complete,
		 the scheduler can be applied to real programs because
		 of the speed gained through consistency techniques.",
}
@MastersThesis{ertl90,
  author =	"M. Anton Ertl",
  title =	"Coroutining und Constraints in der
		 Logik-Programmierung",
  school =	"{Technische Universit{\"a}t Wien}",
  type = 	"Diplomarbeit",
  year = 	"1990",
  address =	"Austria",
  URL =  	"http://www.complang.tuwien.ac.at/Diplomarbeiten/ertl90.ps.gz",
  note = 	"In German",
  abstract =	"Logic Programming is distinguished by the declarative
		 programming style it encourages. Solutions to search
		 problems are specified easily. Some extensions to
		 Prolog are necessary to apply these advantages to
		 real-world problems.\par {\bf Coroutining} provides for
		 data driven goal execution. Using Coroutining it is
		 easier to separate logic and control and to use a
		 declarative programming style.\par {\bf Constraint
		 Logic Programming} liberates unification from its
		 restriction to syntactic equality and extends it to
		 relations on arbitrary computation domains. It also
		 enables the integration of {\bf consistency techniques}
		 in logic programming. Combinatorial problems are solved
		 efficiently using consistency techniques, since they
		 prune the search tree early and provide information for
		 the application of heuristics.\par Coroutining and
		 consistency techniques have been embedded in a Prolog
		 compiler based on the Warren Abstract Machine. The
		 application of these extensions to other Prolog
		 implementations is described.\par The application of
		 the resulting extended logic programming languages is
		 demonstrated using examples from artificial
		 intelligence, operations research, and systems
		 programming.",
}

Some entries might have been ommited on the list above if author/title combination has already occurred. If in doubt check the duplicate list linked on the right of some citations.

Top ten bibliographies for this result page:

146 matches: Papers and talks produced by the Institut für Computersprachen, TU Wien

This page has been served in 0.11 secs.