VC9で、__restrictを試す - espresso3389の日記

VC8(VS2005)以降では、ついに、__restrictがサポートされた。実質的にはC99にもあるrestrictと同じだ。
平たくいえば、特定のポインタについて、どこにもエイリアスがないよということを示すためのもの。
しばらく、盲目的に、__restrictを使ってコードを書いていたんだけど、ふと、その効果が気になったので、VC9

Microsoft (R) 32-bit C/C++ Optimizing Compiler Version 15.00.21022.08 for 80x86

で試すことにした。

テストのためのコードは以下の通り。

// Copies len ints from `from` to `to`, one element per loop iteration.
// Both pointers are __restrict-qualified, i.e. the caller promises that
// neither pointer has an alias, so the two ranges are assumed not to overlap.
void copy_restrict(const int* __restrict from, int* __restrict to, size_t len)
{
  for(size_t i = 0; i < len; i++)
    to[i] = from[i];
}

// Baseline for the comparison: the same element-by-element copy of len ints
// from `from` to `to`, but with no __restrict on either pointer.
void copy_normal(const int* from, int* to, size_t len)
{
  for(size_t i = 0; i < len; i++)
    to[i] = from[i];
}

// Reference version: copies len ints from `from` to `to` with a single
// memcpy call instead of a hand-written loop.
void copy_memcpy(const int* __restrict from, int* __restrict to, size_t len)
{
  // memcpy takes a byte count, so the element count is scaled by sizeof(int).
  memcpy(to, from, len * sizeof(int));
}

だれが見ても明らかなコード。copy_restrictは__restrictがあるけど、copy_normalにはない。copy_memcpyは、参考程度にmemcpyを使ってみるバージョン。
これを/Oxオプション付き(最適化総動員)でコンパイルする。

  ; void copy_restrict(const int* __restrict from, int* __restrict to, size_t len)
  ; /Ox listing for the __restrict version: the copy loop is a single rep movsd.
  ; Arguments are read relative to esp; offsets shift by 4 after each push.
  mov ecx, DWORD PTR 16[esp-4]  ; ecx = len (third argument), the rep count
  test ecx, ecx
  jbe SHORT copy_restrict_end  ; len == 0: nothing to copy
  push esi
  mov esi, DWORD PTR 8[esp]  ; esi = from (offset accounts for the pushed esi)
  push edi
  mov edi, DWORD PTR 12[esp+4]  ; edi = to (offset accounts for both pushes)
  rep movsd  ; copy ecx dwords from [esi] to [edi]
  pop edi
  pop esi
copy_restrict_end:
  ret 0

; void copy_normal(const int* from, int* to, size_t len)
; /Ox listing without __restrict: an explicit load/store loop remains,
; moving one int per iteration.
  mov edx, DWORD PTR 16[esp-4]  ; edx = len, used as the remaining-element counter
  test edx, edx
  jbe SHORT copy_normal_end  ; len == 0: nothing to copy
  mov eax, DWORD PTR 12[esp-4]  ; eax = to (destination cursor)
  mov ecx, DWORD PTR 8[esp-4]  ; ecx = from
  sub ecx, eax  ; ecx = from - to, so [ecx+eax] addresses the matching source element
  push esi
copy_normal_loop:
  mov esi, DWORD PTR [ecx+eax]  ; load the source element
  mov DWORD PTR [eax], esi  ; store it to the destination
  add eax, 4  ; advance the destination cursor by one int
  sub edx, 1  ; one fewer element remaining
  jne SHORT copy_normal_loop  ; repeat until the counter reaches zero
  pop esi
copy_normal_end:
  ret 0

結局、__restrictをちゃんと付けるとコピーの本体は、

rep movsd

に集約されている。無駄な条件ジャンプは一つだけで、それもコピーする個数が0でないかのチェックに過ぎない。それに比べると、copy_normalは、馬鹿正直なコピールーチンだ。

add eax, 4
sub edx, 1

あたりにポインタとカウンタが別に処理されている悲しさを感じる。とはいえ、僕は古い人なので、実は、eaxでメモリ参照ができることに驚いたりしているんですけど。アキュムレータでもアドレス参照できるんですね。今時は・・・。

まぁ、それはさておき、memcpy版は、

; copy_memcpy listing: computes the byte count, pushes the three memcpy
; arguments (last argument first) and calls _memcpy.
mov eax, DWORD PTR 16[esp-4]  ; eax = len
mov edx, DWORD PTR 8[esp-4]  ; edx = from
lea ecx, DWORD PTR [eax*4]  ; ecx = len * 4, i.e. len * sizeof(int) bytes
mov eax, DWORD PTR 12[esp-4]  ; eax = to
push ecx  ; memcpy argument 3: byte count
push edx  ; memcpy argument 2: source (from)
push eax  ; memcpy argument 1: destination (to)
call _memcpy
add esp, 12  ; discard the three pushed arguments
ret 0

という当たり前すぎる結果に。まぁ、さらにこれを外部から呼び出す場合には、memcpyがインライン展開されるという結果を期待しますけどね。

で、追加実験。

// Follow-up experiment: calls copy_restrict with a constant length of 4
// to see what the optimizer emits when the element count is known.
void copy_restrict4(const int* __restrict from, int* __restrict to)
{
  copy_restrict(from, to, 4);
}

// Same fixed-length experiment through copy_normal; the pointers are
// deliberately left without __restrict here as well.
void copy_normal4(const int* from, int* to)
{
  copy_normal(from, to, 4);
}

// Same fixed-length experiment through the memcpy-based copy_memcpy.
void copy_memcpy4(const int* __restrict from, int* __restrict to)
{
  copy_memcpy(from, to, 4);
}

このコード、copy_normal4には意地悪して__restrictを付けていないんですが、どれでも、

  ; Listing the article reports for all three fixed-length wrappers:
  ; four straight-line dword copies, no loop and no call.
  ; NOTE(review): from[3] is loaded before the to[2] store, which matches the
  ; source order only if the two ranges do not overlap - confirm that this exact
  ; listing also applies to copy_normal4, whose pointers are not __restrict.
  mov eax, DWORD PTR 8[esp-4]  ; eax = from
  mov ecx, DWORD PTR 12[esp-4]  ; ecx = to
  mov edx, DWORD PTR [eax]
  mov DWORD PTR [ecx], edx  ; to[0] = from[0]
  mov edx, DWORD PTR [eax+4]
  mov DWORD PTR [ecx+4], edx  ; to[1] = from[1]
  mov edx, DWORD PTR [eax+8]  ; edx = from[2]
  mov eax, DWORD PTR [eax+12]  ; eax = from[3]; the base pointer is no longer needed
  mov DWORD PTR [ecx+8], edx  ; to[2] = from[2]
  mov DWORD PTR [ecx+12], eax  ; to[3] = from[3]
  ret 0

と期待通りに、ループ展開された形になります。ループ展開されてしまうと、当然ながら、__restrictは関係ないわけで。

次に、せっかくなので、もっと嫌らしいコードを。

// Adds each of the count elements of data into *sum, updating *sum on every
// iteration. Without __restrict, sum may point at one of the data elements.
void calc_sum(int* data, size_t count, int* sum)
{
  for(size_t i = 0; i < count; i++)
    *sum += data[i];
}

// Same accumulation as calc_sum, but data and sum are __restrict-qualified,
// i.e. the caller promises that sum does not point into data.
void calc_sum_restrict(int* __restrict data, size_t count, int* __restrict sum)
{
  for(size_t i = 0; i < count; i++)
    *sum += data[i];
}

これは、sumがdataのどれかを参照していたら悲しいねっていうパターン。__restrictがついていればsumはレジスタ上で処理されることを期待。

  ; void calc_sum(int* data, size_t count, int* sum)
  ; /Ox listing without __restrict: *sum is read-modified-written in memory
  ; on every iteration.
  mov edx, DWORD PTR 12[esp-4]  ; edx = count
  xor eax, eax  ; eax = i = 0
  test edx, edx
  jbe SHORT calc_sum_end  ; count == 0: skip the loop
  mov ecx, DWORD PTR 16[esp-4]  ; ecx = sum
  push esi
  mov esi, DWORD PTR 8[esp]  ; esi = data (offset accounts for the pushed esi)
  push edi
calc_sum_loop:
  mov edi, DWORD PTR [esi+eax*4]  ; edi = data[i]
  add DWORD PTR [ecx], edi  ;;;;; written to memory inside the loop
  inc eax
  cmp eax, edx
  jb SHORT calc_sum_loop  ; continue while i < count (unsigned compare)
  pop edi
  pop esi
calc_sum_end:
  ret 0

  ; void calc_sum_restrict(int* __restrict data, size_t count, int* __restrict sum)
  ; /Ox listing with __restrict: *sum is loaded once, accumulated in ecx,
  ; and stored back once after the loop.
  mov edx, DWORD PTR 12[esp-4]  ; edx = count
  xor eax, eax  ; eax = i = 0
  test edx, edx
  jbe SHORT calc_sum_restrict_end  ; count == 0: skip the loop
  push esi
  mov esi, DWORD PTR 8[esp]  ; esi = data (offset accounts for the pushed esi)
  push edi
  mov edi, DWORD PTR 16[esp+4]  ; edi = sum (offset accounts for both pushes)
  mov ecx, DWORD PTR [edi]  ; ecx = initial *sum
calc_sum_r_loop:
  add ecx, DWORD PTR [esi+eax*4]  ; accumulate data[i] in the register
  inc eax
  cmp eax, edx
  jb SHORT calc_sum_r_loop  ; continue while i < count (unsigned compare)
  mov DWORD PTR [edi], ecx  ;;;;; written to memory after the loop ends
  pop edi
  pop esi
calc_sum_restrict_end:
  ret 0

思った通りになった。まぁ、ライトバックキャッシュも働くだろうから、この程度でそんなに悲しい結果にはならないと思うけど。