diff --git a/lib/Makefile b/lib/Makefile
index ea372a05..6e2a85d8 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -54,7 +54,7 @@ WIN_O = crt1.o crt1w.o wincrt1.o wincrt1w.o dllcrt1.o dllmain.o
 
 OBJ-i386 = $(I386_O) $(BCHECK_O) $(DSO_O)
 OBJ-x86_64 = $(X86_64_O) va_list.o $(BCHECK_O) $(DSO_O)
-OBJ-x86_64-osx = $(X86_64_O) va_list.o
+OBJ-x86_64-osx = $(X86_64_O) va_list.o $(BCHECK_O)
 OBJ-i386-win32 = $(I386_O) chkstk.o $(B_O) $(WIN_O)
 OBJ-x86_64-win32 = $(X86_64_O) chkstk.o $(B_O) $(WIN_O)
 OBJ-arm64 = $(ARM64_O) $(BCHECK_O) $(DSO_O)
diff --git a/lib/bcheck.c b/lib/bcheck.c
index f1f01b91..ec861df9 100644
--- a/lib/bcheck.c
+++ b/lib/bcheck.c
@@ -27,6 +27,7 @@
  && !defined(__FreeBSD_kernel__) \
  && !defined(__DragonFly__) \
  && !defined(__OpenBSD__) \
+ && !defined(__APPLE__) \
  && !defined(__NetBSD__)
 #include <malloc.h>
 #endif
@@ -98,7 +99,14 @@ static CRITICAL_SECTION bounds_sem;
 #include <pthread.h>
 #include <dlfcn.h>
 #include <errno.h>
-#if 0
+#ifdef __APPLE__
+#include <dispatch/dispatch.h>
+static dispatch_semaphore_t bounds_sem;
+#define INIT_SEM()             bounds_sem = dispatch_semaphore_create(1)
+#define EXIT_SEM()             dispatch_release(*(dispatch_object_t*)&bounds_sem)
+#define WAIT_SEM()             if (use_sem) dispatch_semaphore_wait(bounds_sem, DISPATCH_TIME_FOREVER)
+#define POST_SEM()             if (use_sem) dispatch_semaphore_signal(bounds_sem)
+#elif 0
 #include <semaphore.h>
 static sem_t bounds_sem;
 #define INIT_SEM()             sem_init (&bounds_sem, 0, 1)
@@ -198,7 +206,7 @@ DLL_EXPORT void * __bound_ptr_indir12(void *p, size_t offset);
 DLL_EXPORT void * __bound_ptr_indir16(void *p, size_t offset);
 DLL_EXPORT void FASTCALL __bound_local_new(void *p1);
 DLL_EXPORT void FASTCALL __bound_local_delete(void *p1);
-void __bound_init(size_t *);
+void __bound_init(size_t *, int);
 void __bound_main_arg(char **p);
 void __bound_exit(void);
 #if !defined(_WIN32)
@@ -780,9 +788,10 @@ void __bound_siglongjmp(jmp_buf env, int val)
 #pragma GCC diagnostic pop
 #endif
 
-void __bound_init(size_t *p)
+void __bound_init(size_t *p, int mode)
 {
-    dprintf(stderr, "%s, %s(): start\n", __FILE__, __FUNCTION__);
+    dprintf(stderr, "%s, %s(): start %s\n", __FILE__, __FUNCTION__,
+            mode < 0 ? "lazy" : mode == 0 ? "normal use" : "for -run");
 
     if (inited) {
         WAIT_SEM();
@@ -800,9 +809,12 @@ void __bound_init(size_t *p)
 
 #if MALLOC_REDIR
     {
-        void *addr = RTLD_NEXT;
+        void *addr = mode > 0 ? RTLD_DEFAULT : RTLD_NEXT;
 
-        /* tcc -run required RTLD_DEFAULT. Normal usage requires RTLD_NEXT */
+        /* tcc -run requires RTLD_DEFAULT. Normal usage requires RTLD_NEXT,
+           but using RTLD_NEXT with -run segfaults in dyld on macOS because
+           the generated code segment isn't registered with dyld, so the
+           caller image of dlsym isn't known to it. */
         *(void **) (&malloc_redir) = dlsym (addr, "malloc");
         if (malloc_redir == NULL) {
             dprintf(stderr, "%s, %s(): use RTLD_DEFAULT\n",
@@ -877,6 +889,9 @@ void __bound_init(size_t *p)
     WAIT_SEM ();
 
 #if HAVE_CTYPE
+#ifdef __APPLE__
+#warning fill out for MacOS (see <_ctype.h> and <runetype.h>)
+#else
     /* XXX: Does not work if locale is changed */
     tree = splay_insert((size_t) __ctype_b_loc(),
                         sizeof (unsigned short *), tree);
@@ -891,6 +906,7 @@ void __bound_init(size_t *p)
     tree = splay_insert((size_t) (*__ctype_toupper_loc() - 128),
                         384 * sizeof (__int32_t), tree);
 #endif
+#endif
 #if HAVE_ERRNO
     tree = splay_insert((size_t) (&errno), sizeof (int), tree);
 #endif
@@ -985,7 +1001,7 @@ void __attribute__((destructor)) __bound_exit(void)
     dprintf(stderr, "%s, %s():\n", __FILE__, __FUNCTION__);
 
     if (inited) {
-#if !defined(_WIN32)
+#if !defined(_WIN32) && !defined(__APPLE__)
         if (print_heap) {
             extern void __libc_freeres (void);
             __libc_freeres ();
@@ -1097,7 +1113,7 @@ void *__bound_malloc(size_t size, const void *caller)
 #if MALLOC_REDIR
     /* This will catch the first dlsym call from __bound_init */
     if (malloc_redir == NULL) {
-        __bound_init (0);
+        __bound_init (0, -1);
         if (malloc_redir == NULL) {
             ptr = &initial_pool[pool_index];
             pool_index = (pool_index + size + 15) & ~15;
@@ -1259,7 +1275,7 @@ void *__bound_calloc(size_t nmemb, size_t size)
 #if MALLOC_REDIR
     /* This will catch the first dlsym call from __bound_init */
     if (malloc_redir == NULL) {
-        __bound_init (0);
+        __bound_init (0, -1);
         if (malloc_redir == NULL) {
             ptr = &initial_pool[pool_index];
             pool_index = (pool_index + size + 15) & ~15;
diff --git a/lib/bt-exe.c b/lib/bt-exe.c
index 350261a8..a0e8fad8 100644
--- a/lib/bt-exe.c
+++ b/lib/bt-exe.c
@@ -12,12 +12,12 @@ int (*__rt_error)(void*, void*, const char *, va_list);
 #endif
 
 __declspec(dllexport)
-void __bt_init(rt_context *p, int num_callers)
+void __bt_init(rt_context *p, int num_callers, int mode)
 {
     __attribute__((weak)) int main();
-    __attribute__((weak)) void __bound_init(void*);
+    __attribute__((weak)) void __bound_init(void*, int);
     struct rt_context *rc = &g_rtctxt;
-    //fprintf(stderr, "__bt_init %d %p %p\n", num_callers, p->stab_sym, p->bounds_start), fflush(stderr);
+    //fprintf(stderr, "__bt_init %d %p %p %d\n", num_callers, p->stab_sym, p->bounds_start, mode), fflush(stderr);
     if (num_callers) {
         memcpy(rc, p, offsetof(rt_context, next));
         rc->num_callers = num_callers - 1;
@@ -28,7 +28,7 @@ void __bt_init(rt_context *p, int num_callers)
         p->next = rc->next, rc->next = p;
     }
     if (__bound_init && p->bounds_start)
-        __bound_init(p->bounds_start);
+        __bound_init(p->bounds_start, mode);
 }
 
 /* copy a string and truncate it. */
diff --git a/tccelf.c b/tccelf.c
index 14a60ce7..bddd13ea 100644
--- a/tccelf.c
+++ b/tccelf.c
@@ -1388,7 +1388,10 @@ ST_FUNC void tcc_add_btstub(TCCState *s1)
     put_ptr(s1, stab_section->link, 0);
     section_ptr_add(s, 3 * PTR_SIZE);
     /* prog_base */
+#ifndef TCC_TARGET_MACHO
+    /* XXX: this relocation is wrong; it uses sym-index 0 (local, undef) */
     put_elf_reloc(s1->symtab, s, s->data_offset, R_DATA_PTR, 0);
+#endif
     section_ptr_add(s, PTR_SIZE);
     n = 2 * PTR_SIZE;
 #ifdef CONFIG_TCC_BCHECK
@@ -1411,11 +1414,11 @@ ST_FUNC void tcc_add_btstub(TCCState *s1)
         cstr_printf(&cstr, "__bt_init_dll(0);");
 #endif
 #endif
-    cstr_printf(&cstr, "__bt_init(__rt_info,%d);}",
+    cstr_printf(&cstr, "__bt_init(__rt_info,%d, 0);}",
         s1->output_type == TCC_OUTPUT_DLL ? 0 : s1->rt_num_callers + 1);
     tcc_compile_string(s1, cstr.data);
     cstr_free(&cstr);
-    set_local_sym(s1, "__rt_info", s, o);
+    set_local_sym(s1, "___rt_info" + !s1->leading_underscore, s, o);
 }
 #endif
 
diff --git a/tccrun.c b/tccrun.c
index 944cd5a3..d0379c9b 100644
--- a/tccrun.c
+++ b/tccrun.c
@@ -172,7 +172,7 @@ LIBTCCAPI int tcc_run(TCCState *s1, int argc, char **argv)
 #ifdef CONFIG_TCC_BCHECK
         if (s1->do_bounds_check) {
             if ((p = tcc_get_symbol(s1, "__bound_init")))
-                ((void(*)(void*))p)(bounds_section->data);
+                ((void(*)(void*, int))p)(bounds_section->data, 1);
         }
 #endif
         set_exception_handler();