本以为写写异构代码也不算一件难事。毕竟做 Java 开发,日常也写些 C 代码,无论如何也不至于到棘手到地步。哪曾想,JNI 开发不难,但问题颇多。

为了排查一个 JNI 的堆外内存泄露问题,简直是绞尽脑汁而不得其门。最后查找的问题原因也特别简单,但凡认真学习了 JNI 开发都能够避免。

问题代码类似于下列代码:

#include "Main.h"

JNIEXPORT void JNICALL Java_Main_sayHi (JNIEnv *env, jobject jobj, jstring jstr)
{
    const char *str = (*env)->GetStringUTFChars(env, jstr, 0);
    int len = (*env)->GetStringUTFLength(env, jstr);

    // ... some code omitted 

    return ;
}

问题表现

最初的表现很明显,2核4G的机器,2GB的 Java 堆,最终物理内存不够用,导致 Java 进程被强制杀掉。几乎毫无疑问,就是 Java 堆外内存引发的问题。不过遗憾的是,物理内存不够用,没有留下足够多的堆栈详情。另外找了一台快要把内存打满的机器,打印信息。

$ pmap -x 27765 | sort -n -k3 | tail -n 50
00007f460c9ac000   55632   23896   23896 rw---   [ anon ]
00007f45d0000000   38980   38980   38980 rw---   [ anon ]
00007f45dc000000   45980   39512   39512 rw---   [ anon ]
00007f45e0000000   46120   39652   39652 rw---   [ anon ]
00007f45d4000000   42160   41744   41744 rw---   [ anon ]
00007f45c8000000   45992   45992   45992 rw---   [ anon ]
00007f4610000000   48896   48872   48872 rwx--   [ anon ]
00007f45f8000000   51084   51084   51084 rw---   [ anon ]
00007f4600000000   52328   52328   52328 rw---   [ anon ]
00007f4608000000   58332   55452   55452 rw---   [ anon ]
00000000014f8000   57036   56908   56908 rw---   [ anon ]
00007f45d8000000   63760   57292   57292 rw---   [ anon ]
00007f45e8000000   58488   58488   58488 rw---   [ anon ]
00007f4604000000   59912   59912   59912 rw---   [ anon ]
00007f45fc000000   60080   60080   60080 rw---   [ anon ]
00007f45e4000000   64460   64452   64452 rw---   [ anon ]
00007f45f4000000   65072   65072   65072 rw---   [ anon ]
00007f4620000000   65536   65536   65536 rw---   [ anon ]
0000000080000000 2105600 2008240 2008240 rw---   [ anon ]   # 2GB 的 Java 堆
total kB         4885820 3040036 3030476

比较多的内存块都在 64M 左右大小。确实也很明显,就是申请的内存没有得到释放。

问题排查

C 不似 Java,没有 Garbage Collection 。所有申请的内存都必须主动进行释放。唯一的问题就是 GetStringUTFChars 得到的内存是否会被 JVM 统一管理。

结果当然是不会,虽然是 Java 与 C 衔接的胶水代码,但这部分得到的内存需要自行管理。不过,JNI 也提供了函数来进行释放。

/* from src/share/vm/prims/jni.cpp */
JNI_ENTRY(const char*, jni_GetStringUTFChars(JNIEnv *env, jstring string, jboolean *isCopy))
  JNIWrapper("GetStringUTFChars");
  char* result = NULL;
  oop java_string = JNIHandles::resolve_non_null(string);
  if (java_lang_String::value(java_string) != NULL) {
    size_t length = java_lang_String::utf8_length(java_string);
    /* 申请堆内存 */
    result = AllocateHeap(length + 1, mtInternal, 0, AllocFailStrategy::RETURN_NULL);
    if (result != NULL) {
      java_lang_String::as_utf8_string(java_string, result, (int) length + 1);
      if (isCopy != NULL) {
        *isCopy = JNI_TRUE;
      }
    }
  }
  return result;
JNI_END

/* from src/share/vm/memory/allocation.inline.hpp */
inline char* AllocateHeap(size_t size, MEMFLAGS flags,
    const NativeCallStack& stack,
    AllocFailType alloc_failmode = AllocFailStrategy::EXIT_OOM) {
    // 申请内存
    char* p = (char*) os::malloc(size, flags, stack);
#ifdef ASSERT
    if (PrintMallocFree) trace_heap_malloc(size, "AllocateHeap", p);
#endif
    if (p == NULL && alloc_failmode == AllocFailStrategy::EXIT_OOM) {
        vm_exit_out_of_memory(size, OOM_MALLOC_ERROR, "AllocateHeap");
    }
    return p;
}

/* from src/share/vm/prims/jni.cpp */
JNI_LEAF(void, jni_ReleaseStringUTFChars(JNIEnv *env, jstring str, const char *chars))
  JNIWrapper("ReleaseStringUTFChars");
  if (chars != NULL) {
    // 释放内存
    FreeHeap((char*) chars);
  }
);
JNI_END

/* from src/share/vm/memory/allocation.inline.hpp */
inline void FreeHeap(void* p, MEMFLAGS memflags = mtInternal) {
#ifdef ASSERT
    if (PrintMallocFree) trace_heap_free(p);
#endif
    os::free(p, memflags);
}

从名称就可以看出来,GetStringUTFCharsReleaseStringUTFChars 是相对的方法。而且默认走的都是 Glibc 的 mallocfree

问题验证

为了验证问题的原因和最终的现象,整了一段代码来测试效果。

import java.io.IOException;

public class Main {

    static {
        System.load("/root/jni/Main.so");
    }

    private static final String value = "AAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHH";

    public native void sayHi(String value);

    public static void main(String[] args) throws IOException {
        Main main = new Main();
        while (true)
        {
            for (int i=0;i<100000;i++) {
                char c1 = (char) ('A' + Math.random() * 26);
                char c2 = (char) ('a' + Math.random() * 26);
                main.sayHi(value + c1 + c2);
            }
            System.gc();
            int val = System.in.read();
            if (val == '1')
                break;
        }
    }
}
#include "Main.h"

JNIEXPORT void JNICALL Java_Main_sayHi (JNIEnv *env, jobject jobj, jstring jstr)
{
    const char *str = (*env)->GetStringUTFChars(env, jstr, 0);
    int len = (*env)->GetStringUTFLength(env, jstr);
    printf("%p\n", str);
#ifdef SUCCESS
    (*env)->ReleaseStringUTFChars(env, jstr, str);
#endif
    return ;
}

测试用的源代码

$ java Main
$ pmap -x 22690 | sort -n -k3
# ...
00007f0d44000000   65536   65500   65500 rw---   [ anon ]
00007f0d04000000  131072  131068  131068 rw---   [ anon ]
00007f0cc4000000  131072  131072  131072 rw---   [ anon ]
00007f0ccc000000  131072  131072  131072 rw---   [ anon ]
00007f0cd4000000  131072  131072  131072 rw---   [ anon ]
00007f0cdc000000  131072  131072  131072 rw---   [ anon ]
00007f0ce4000000  131072  131072  131072 rw---   [ anon ]
00007f0cec000000  131072  131072  131072 rw---   [ anon ]
00007f0cf4000000  131072  131072  131072 rw---   [ anon ]
00007f0cfc000000  131072  131072  131072 rw---   [ anon ]
total kB         3822700 1293952 1276900 

确实引起了内存的增长,而将内存 dump 下了之后,也看到了大量字符串。

$ PID=23756; grep rw-p /proc/${PID}/maps | sed -n 's/^\([0-9a-f]*\)-\([0-9a-f]*\) .*$/\1 \2/p' | while read start stop;  \
do \
    gdb --batch --pid ${PID} -ex "dump memory ${PID}-$start-$stop.dump 0x$start 0x$stop";  \
done 
$ strings 23756-7f6f6c000000-7f6f70000000.dump| grep 'AAAA'
# ...
AAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHVn
AAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHXp
AAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHFn

待到添加上 ReleaseStringUTFChars ,申请到内存得到释放,确实解决了这个问题。

总结

定位问题确实看起来不难,但这种异构代码,真正难点在于定位问题。这期间走了不少的弯路,也无法确定究竟是对是错。毕竟问题的原因多种多样:

  1. 可能是 Java 代码用了诸如 Zip 等容易出现堆外内存泄露的方法
  2. 不当使用 JNI (就像是上文描述的问题)
  3. 再有就是第三方 C 库使用不当,或者其本身存在 BUG。

这是第一次排查异构代码的问题,颇多难以分辨,无法做定论之事。

再就是 C 的平台相关性实在是太过于严重了,加之本就不太熟悉,搞个编译新的库文件就好一番折腾。

  __                    __                  
 / _| __ _ _ __   __ _ / _| ___ _ __   __ _ 
| |_ / _` | '_ \ / _` | |_ / _ \ '_ \ / _` |
|  _| (_| | | | | (_| |  _|  __/ | | | (_| |
|_|  \__,_|_| |_|\__, |_|  \___|_| |_|\__, |
                 |___/                |___/