関数内関数 - Note

例外はうまく動かない(llvm-g++でも無効にされている)みたいですし、拡張精度浮動小数点型はサポートしてないですし、GCはやる気ないし、ということで関数内関数です。
これがしっかりインプリメントされていれば、まだ気力が維持できるというものです。
a.c

int out(int x, int y)
{
    int in(int z)
    {
        return x + z;
    }
    return in(y);
}

a.ll

; ModuleID = 'a.c'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
target triple = "mingw32"
    %struct.FRAME.out = type { i32 }

define i32 @out(i32 %x, i32 %y) {
entry:
    %x_addr = alloca i32        ; <i32*> [#uses=2]
    %y_addr = alloca i32        ; <i32*> [#uses=2]
    %retval = alloca i32, align 4        ; <i32*> [#uses=2]
    %FRAME.0 = alloca %struct.FRAME.out, align 4        ; <%struct.FRAME.out*> [#uses=2]
    %tmp = alloca i32, align 4        ; <i32*> [#uses=2]
    "alloca point" = bitcast i32 0 to i32        ; <i32> [#uses=0]
    store i32 %x, i32* %x_addr
    store i32 %y, i32* %y_addr
    %tmp1 = getelementptr %struct.FRAME.out* %FRAME.0, i32 0, i32 0        ; <i32*> [#uses=1]
    %tmp2 = load i32* %x_addr        ; <i32> [#uses=1]
    store i32 %tmp2, i32* %tmp1
    %tmp3 = load i32* %y_addr        ; <i32> [#uses=1]
    %tmp4 = call i32 @in.930( %struct.FRAME.out* %FRAME.0 inreg , i32 %tmp3 )        ; <i32> [#uses=1]
    store i32 %tmp4, i32* %tmp
    %tmp5 = load i32* %tmp        ; <i32> [#uses=1]
    store i32 %tmp5, i32* %retval
    br label %return

return:        ; preds = %entry
    %retval6 = load i32* %retval        ; <i32> [#uses=1]
    ret i32 %retval6
}

define internal i32 @in.930(%struct.FRAME.out* inreg  %CHAIN.1, i32 %z) {
entry:
    %CHAIN.1_addr = alloca %struct.FRAME.out*        ; <%struct.FRAME.out**> [#uses=2]
    %z_addr = alloca i32        ; <i32*> [#uses=2]
    %retval = alloca i32, align 4        ; <i32*> [#uses=2]
    %tmp = alloca i32, align 4        ; <i32*> [#uses=2]
    %tmp1 = alloca i32, align 4        ; <i32*> [#uses=2]
    "alloca point" = bitcast i32 0 to i32        ; <i32> [#uses=0]
    store %struct.FRAME.out* %CHAIN.1, %struct.FRAME.out** %CHAIN.1_addr
    store i32 %z, i32* %z_addr
    %tmp2 = load %struct.FRAME.out** %CHAIN.1_addr        ; <%struct.FRAME.out*> [#uses=1]
    %tmp3 = getelementptr %struct.FRAME.out* %tmp2, i32 0, i32 0        ; <i32*> [#uses=1]
    %tmp4 = load i32* %tmp3        ; <i32> [#uses=1]
    store i32 %tmp4, i32* %tmp
    %tmp5 = load i32* %tmp        ; <i32> [#uses=1]
    %tmp6 = load i32* %z_addr        ; <i32> [#uses=1]
    %tmp7 = add i32 %tmp5, %tmp6        ; <i32> [#uses=1]
    store i32 %tmp7, i32* %tmp1
    %tmp8 = load i32* %tmp1        ; <i32> [#uses=1]
    store i32 %tmp8, i32* %retval
    br label %return

return:        ; preds = %entry
    %retval9 = load i32* %retval        ; <i32> [#uses=1]
    ret i32 %retval9
}

親のローカル変数全部構造体にぶち込む作りだー。
llc。
a.s

    .text
    .align    16
    .globl    _out
    .def     _out;    .scl    2;    .type    32;    .endef
_out:
    subl $28, %esp
    movl 32(%esp), %eax
    movl %eax, 24(%esp)
    movl 36(%esp), %eax
    movl %eax, 20(%esp)
    movl 24(%esp), %eax
    movl %eax, 8(%esp)
    movl 20(%esp), %eax
    movl %eax, (%esp)
    leal 8(%esp), %eax
    call _in.930
    movl %eax, 4(%esp)
    movl %eax, 16(%esp)
LBB1_1:    #return
    movl 16(%esp), %eax
    addl $28, %esp
    ret


    .align    16
    .def     _in.930;    .scl    3;    .type    32;    .endef
_in.930:
    subl $20, %esp
    movl %eax, 16(%esp)
    movl 24(%esp), %eax
    movl %eax, 12(%esp)
    movl 16(%esp), %eax
    movl (%eax), %eax
    movl %eax, 4(%esp)
    addl 12(%esp), %eax
    movl %eax, (%esp)
    movl %eax, 8(%esp)
LBB2_1:    #return
    movl 8(%esp), %eax
    addl $20, %esp
    ret

llcは本当に機械的なトランスレートしかしないな……。
いきなりやる気減。親のebp触れよ！
もっとも、llvm.frameaddressはあるのですが、インライン展開どころかもっと過激に最適化するのでデバッグ以外じゃ使えねーよ、などと書いてあります。
ようしそれなら見せてもらおうその最適化能力。
optの使い方がいまいちわかりませんでしたので、llvm-ldに-O5を渡して最適化します。
a-o.ll

; ModuleID = 'a.out.bc'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
target triple = "mingw32"
    %struct.FRAME.out = type { i32 }

define i32 @out(i32 %x, i32 %y) {
entry:
    %tmp7.i = add i32 %x, %y        ; <i32> [#uses=1]
    ret i32 %tmp7.i
}

a-o.s

    .text
    .align    16
    .globl    _out
    .def     _out;    .scl    2;    .type    32;    .endef
_out:
    movl 4(%esp), %eax
    addl 8(%esp), %eax
    ret

見事にインライン化されました。
次は勿論downward closureですね。
thunk.c

#include<stdio.h>

void f(void (*c)(void))
{
    c();
}

int main(int argc, char **argv)
{
    void i(void)
    {
        printf("%d\n", argc);
    }
    f(i);
    return 0;
}

もしllvm-gccにAdaが含まれるようになりましたら、これがどうなるかが全てを分けます。大袈裟です。
thunk.ll

; ModuleID = 'thunk.bc'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
target triple = "mingw32"
    %struct.FRAME.main = type { i32, %struct.__builtin_trampoline }
    %struct.__builtin_trampoline = type { [10 x i8] }
@.str = internal constant [4 x i8] c"%d\0A\00"        ; <[4 x i8]*> [#uses=1]

define void @f(void ()* %c) {
entry:
    %c_addr = alloca void ()*        ; <void ()**> [#uses=2]
    "alloca point" = bitcast i32 0 to i32        ; <i32> [#uses=0]
    store void ()* %c, void ()** %c_addr
    %tmp = load void ()** %c_addr        ; <void ()*> [#uses=1]
    call void %tmp( )
    br label %return

return:        ; preds = %entry
    ret void
}

define i32 @main(i32 %argc, i8** %argv) {
entry:
    %argc_addr = alloca i32        ; <i32*> [#uses=2]
    %argv_addr = alloca i8**        ; <i8***> [#uses=1]
    %retval = alloca i32, align 4        ; <i32*> [#uses=2]
    %FRAME.4 = alloca %struct.FRAME.main, align 16        ; <%struct.FRAME.main*> [#uses=4]
    %tmp = alloca i8*, align 4        ; <i8**> [#uses=2]
    %tmp1 = alloca void ()*, align 4        ; <void ()**> [#uses=2]
    %tmp2 = alloca i32, align 4        ; <i32*> [#uses=2]
    "alloca point" = bitcast i32 0 to i32        ; <i32> [#uses=0]
    store i32 %argc, i32* %argc_addr
    store i8** %argv, i8*** %argv_addr
    %tmp3 = getelementptr %struct.FRAME.main* %FRAME.4, i32 0, i32 0        ; <i32*> [#uses=1]
    %tmp4 = load i32* %argc_addr        ; <i32> [#uses=1]
    store i32 %tmp4, i32* %tmp3
    %tmp5 = getelementptr %struct.FRAME.main* %FRAME.4, i32 0, i32 1        ; <%struct.__builtin_trampoline*> [#uses=1]
    %tmp56 = bitcast %struct.__builtin_trampoline* %tmp5 to i8*        ; <i8*> [#uses=1]
    %FRAME.47 = bitcast %struct.FRAME.main* %FRAME.4 to i8*        ; <i8*> [#uses=1]
    call void @__builtin_init_trampoline( i8* %tmp56, i8* bitcast (void (%struct.FRAME.main* inreg )* @i.1376 to i8*), i8* %FRAME.47 )
    %tmp8 = getelementptr %struct.FRAME.main* %FRAME.4, i32 0, i32 1        ; <%struct.__builtin_trampoline*> [#uses=1]
    %tmp89 = bitcast %struct.__builtin_trampoline* %tmp8 to i8*        ; <i8*> [#uses=1]
    %tmp10 = call i8* @__builtin_adjust_trampoline( i8* %tmp89 )        ; <i8*> [#uses=1]
    store i8* %tmp10, i8** %tmp
    %tmp11 = load i8** %tmp        ; <i8*> [#uses=1]
    %tmp1112 = bitcast i8* %tmp11 to void ()*        ; <void ()*> [#uses=1]
    store void ()* %tmp1112, void ()** %tmp1
    %tmp13 = load void ()** %tmp1        ; <void ()*> [#uses=1]
    call void @f( void ()* %tmp13 )
    store i32 0, i32* %tmp2
    %tmp14 = load i32* %tmp2        ; <i32> [#uses=1]
    store i32 %tmp14, i32* %retval
    br label %return

return:        ; preds = %entry
    %retval15 = load i32* %retval        ; <i32> [#uses=1]
    ret i32 %retval15
}

declare void @__builtin_init_trampoline(i8*, i8*, i8*)

declare i8* @__builtin_adjust_trampoline(i8*)

define internal void @i.1376(%struct.FRAME.main* inreg  %CHAIN.5) {
entry:
    %CHAIN.5_addr = alloca %struct.FRAME.main*        ; <%struct.FRAME.main**> [#uses=2]
    %tmp = alloca i32, align 4        ; <i32*> [#uses=2]
    "alloca point" = bitcast i32 0 to i32        ; <i32> [#uses=0]
    store %struct.FRAME.main* %CHAIN.5, %struct.FRAME.main** %CHAIN.5_addr
    %tmp1 = load %struct.FRAME.main** %CHAIN.5_addr        ; <%struct.FRAME.main*> [#uses=1]
    %tmp2 = getelementptr %struct.FRAME.main* %tmp1, i32 0, i32 0        ; <i32*> [#uses=1]
    %tmp3 = load i32* %tmp2        ; <i32> [#uses=1]
    store i32 %tmp3, i32* %tmp
    %tmp4 = getelementptr [4 x i8]* @.str, i32 0, i32 0        ; <i8*> [#uses=1]
    %tmp5 = load i32* %tmp        ; <i32> [#uses=1]
    %tmp6 = call i32 (i8*, ...)* @printf( i8* %tmp4, i32 %tmp5 )        ; <i32> [#uses=0]
    br label %return

return:        ; preds = %entry
    ret void
}

declare i32 @printf(i8*, ...)

へえ……トランポリンをサポートしていますね。
thunk.s

    .text
    .align    16
    .globl    _f
    .def     _f;    .scl    2;    .type    32;    .endef
_f:
    subl $12, %esp
    movl 16(%esp), %eax
    movl %eax, 8(%esp)
    call *%eax
LBB1_1:    #return
    addl $12, %esp
    ret


    .align    16
    .globl    _main
    .def     _main;    .scl    2;    .type    32;    .endef
_main:
    subl $60, %esp
    movl %ebp, 56(%esp)
    leal 56(%esp), %ebp
    andl $4294967280, %esp
    movl $16, %eax
    call __alloca
    movl %esi, -4(%ebp)
    call ___main
    fnstcw -54(%ebp)
    movb $2, -53(%ebp)
    fldcw -54(%ebp)
    movl 8(%ebp), %eax
    movl %eax, -8(%ebp)
    movl 12(%ebp), %eax
    movl %eax, -12(%ebp)
    movl -8(%ebp), %eax
    movl %eax, -40(%ebp)
    subl $16, %esp
    leal -40(%ebp), %eax
    movl %eax, 8(%esp)
    leal -36(%ebp), %esi
    movl %esi, (%esp)
    movl $_i.1376, 4(%esp)
    call ___builtin_init_trampoline
    addl $16, %esp
    subl $16, %esp
    movl %esi, (%esp)
    call ___builtin_adjust_trampoline
    addl $16, %esp
    movl %eax, -44(%ebp)
    movl %eax, -48(%ebp)
    subl $16, %esp
    movl %eax, (%esp)
    call _f
    addl $16, %esp
    movl $0, -52(%ebp)
    movl $0, -16(%ebp)
LBB2_1:    #return
    movl -16(%ebp), %eax
    movl -4(%ebp), %esi
    movl %ebp, %esp
    popl %ebp
    ret


    .align    16
    .def     _i.1376;    .scl    3;    .type    32;    .endef
_i.1376:
    subl $28, %esp
    movl %eax, 24(%esp)
    movl (%eax), %eax
    movl %eax, 20(%esp)
    movl %eax, 4(%esp)
    movl $_.str, (%esp)
    call _printf
LBB3_1:    #return
    addl $28, %esp
    ret
    .data
_.str:                # .str
    .asciz    "%d\n"

    .def     ___builtin_adjust_trampoline;    .scl    2;    .type    32;    .endef
    .def     ___builtin_init_trampoline;    .scl    2;    .type    32;    .endef
    .def     _printf;    .scl    2;    .type    32;    .endef

___builtin_init_trampolineと___builtin_adjust_trampolineというランタイムが必要みたいです。
で、これを最適化しますと。
thunk-o.ll

; ModuleID = 'thunk_o.bc'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
target triple = "mingw32"
    %struct.FRAME.main = type { i32, %struct.__builtin_trampoline }
    %struct.__builtin_trampoline = type { [10 x i8] }
@.str = internal constant [4 x i8] c"%d\0A\00"        ; <[4 x i8]*> [#uses=1]

define i32 @main(i32 %argc, i8** %argv) {
entry:
    %FRAME.4 = alloca %struct.FRAME.main, align 16        ; <%struct.FRAME.main*> [#uses=3]
    %tmp3 = getelementptr %struct.FRAME.main* %FRAME.4, i32 0, i32 0        ; <i32*> [#uses=1]
    store i32 %argc, i32* %tmp3
    %tmp56 = getelementptr %struct.FRAME.main* %FRAME.4, i32 0, i32 1, i32 0, i32 0        ; <i8*> [#uses=2]
    %FRAME.47 = bitcast %struct.FRAME.main* %FRAME.4 to i8*        ; <i8*> [#uses=1]
    call void @__builtin_init_trampoline( i8* %tmp56, i8* bitcast (void (%struct.FRAME.main* inreg )* @i.1376 to i8*), i8* %FRAME.47 )
    %tmp10 = call i8* @__builtin_adjust_trampoline( i8* %tmp56 )        ; <i8*> [#uses=1]
    %tmp1112 = bitcast i8* %tmp10 to void ()*        ; <void ()*> [#uses=1]
    call void %tmp1112( )
    ret i32 0
}

declare void @__builtin_init_trampoline(i8*, i8*, i8*)

declare i8* @__builtin_adjust_trampoline(i8*)

define internal void @i.1376(%struct.FRAME.main* inreg  %CHAIN.5) {
entry:
    %tmp2 = getelementptr %struct.FRAME.main* %CHAIN.5, i32 0, i32 0        ; <i32*> [#uses=1]
    %tmp3 = load i32* %tmp2        ; <i32> [#uses=1]
    %tmp6 = call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8]* @.str, i32 0, i32 0), i32 %tmp3 )        ; <i32> [#uses=0]
    ret void
}

declare i32 @printf(i8*, ...)

___builtin_init_trampolineと___builtin_adjust_trampolineってintrinsicじゃ無いのか！
外部関数が間に入っているせいで関数内関数のコールバックはインライン展開できない、と。まあgccでもインライン展開できませんが。
thunk-o.s

    .text
    .align    16
    .globl    _main
    .def     _main;    .scl    2;    .type    32;    .endef
_main:
    subl $44, %esp
    movl %ebp, 40(%esp)
    leal 40(%esp), %ebp
    andl $4294967280, %esp
    movl $16, %eax
    call __alloca
    movl %esi, -4(%ebp)
    call ___main
    fnstcw -26(%ebp)
    movb $2, -25(%ebp)
    fldcw -26(%ebp)
    movl 8(%ebp), %eax
    movl %eax, -24(%ebp)
    subl $16, %esp
    leal -24(%ebp), %eax
    movl %eax, 8(%esp)
    leal -20(%ebp), %esi
    movl %esi, (%esp)
    movl $_i.1376, 4(%esp)
    call ___builtin_init_trampoline
    addl $16, %esp
    subl $16, %esp
    movl %esi, (%esp)
    call ___builtin_adjust_trampoline
    addl $16, %esp
    call *%eax
    xorl %eax, %eax
    movl -4(%ebp), %esi
    movl %ebp, %esp
    popl %ebp
    ret


    .align    16
    .def     _i.1376;    .scl    3;    .type    32;    .endef
_i.1376:
    subl $12, %esp
    movl (%eax), %eax
    movl %eax, 4(%esp)
    movl $_.str, (%esp)
    call _printf
    addl $12, %esp
    ret
    .data
_.str:                # .str
    .asciz    "%d\n"

    .def     ___builtin_adjust_trampoline;    .scl    2;    .type    32;    .endef
    .def     ___builtin_init_trampoline;    .scl    2;    .type    32;    .endef
    .def     _printf;    .scl    2;    .type    32;    .endef

トランポリンだけ見るとgccと等価ですのでそれはいいです。
それよりひどいのが親の引数の扱い。
丁寧にローカルにコピーし直すなよ！最初から親のebpを(ry
だめだこのVMは。何のためにスタックフレーム上でebpがリンクリストを構成してると思って(ry
LLVMには未来を見るべきなのであって、現状は見ちゃだめなんでしょうね……。
ところで、呼び出し規約などもひっくるめて真面目にやるなら、こんなめんどくさい中間言語になるなあ。

%out = type i32 (caller i32, callee i32, save i32 eip, save i32 ebp, local i32)
define i32 @in(callee i32 arg, save i32 eip, save frameof<%out> ebp){ ...

callerは呼び出し側が開放する引数。全部そうなってるのがcdecl。calleeは関数側が開放する引数。全部そうなってるのがstdcall。他にregisterがある。
saveで少なくともeipとebpが保存される位置を示さないといけない。ebpを省略したらebpを省略する最適化OKの意味。
localは固定位置に確保するローカル変数。frameof<%関数型>で特定の関数型の中のebpを指す意味のつもり。%関数型*だと関数ポインタになってしまうので。
ここまでやるなら「最適化したあとのプログラムが使うスタックの量だけSPを移動」する命令も欲しいな……。
この手のバックエンドを本気で使おうとするなら、CPUの扱える数値型が全部使えること、OSがSEHをサポートしてるならそれをサポートすること、一般的な呼び出し規約は全部サポートむしろコルーチン用にスタックではなくヒープを使う関数を書けるぐらいの柔軟性、あと関数内関数を少なくともその辺の大した最適化なんかしないPascal コンパイラと同程度には扱えること、程度は必須と思う。最適化能力よりも優先的に。勿論最適化能力としてはバックエンドでしかできないレジスタ割りつけのほうがフロントエンドでもなんとかなるインライン化より重要。バックエンドのせいでできることが限定されたり、最適化なんかしないベタ機械語出力するコンパイラより効率が落ちたりしたら、嫌じゃないですか。
贅沢でしょうか？