/* **********************************************************
 * Copyright 2000 VMware, Inc.  All rights reserved.
 * **********************************************************/

/* 
 * os.c --
 *
 * 	Wrappers for Linux system functions required by "vmmemctl".
 *	This allows customers to build their own vmmemctl driver for
 *	custom versioned kernels without the need for source code.
 */

/*
 * Compile-Time Options
 */

/* When non-zero, os_init() takes an extra module reference to block unload. */
#define	OS_DISABLE_UNLOAD	(0)
/* When non-zero, os_timer_thread_start() logs the pid of the new thread. */
#define	OS_DEBUG		(1)

/*
 * Includes
 */

#include "driver-config.h"

#include <linux/config.h>

#ifdef	MODULE
#include <linux/module.h>
#endif	/* MODULE */

#include <linux/types.h>
#include <linux/kernel.h>
#include "compat_mm.h"
#include <linux/fs.h>
#include <linux/timer.h>
#include <linux/interrupt.h>
#include "compat_sched.h"
#include <asm/uaccess.h>
#include "compat_page.h"
#include "compat_wait.h"

#ifdef	CONFIG_PROC_FS
#include <linux/stat.h>
#include <linux/proc_fs.h>
#endif	/* CONFIG_PROC_FS */

#if	LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
#include <linux/smp_lock.h>

/*
 * Compatibility definitions.
 */

/*
 * Execute as a separate kernel thread on 2.4.0 and later kernels.
 * Allow allocations from high memory on those kernels.
 *
 * Defining OS_KTHREAD selects the kernel-thread based timer code and
 * GFP_HIGHUSER as the page allocation priority further down this file.
 */
#define	OS_KTHREAD	(1)
#endif

#include "os.h"


/*
 * Constants
 */

/*
 * OS_GFP_PRIORITY is the allocation priority handed to alloc_page()
 * by os_alloc_reserved_page().
 */
#ifdef	OS_KTHREAD
/*
 * Use GFP_HIGHUSER when executing in a separate kernel thread
 * context.  This is less stressful to the guest memory system,
 * since it allows the thread to block while memory is reclaimed,
 * and won't take pages from emergency low-memory pools.
 */

#define	OS_GFP_PRIORITY		GFP_HIGHUSER

#else
/*
 * Unable to block if not executing in a separate kernel thread
 * context, so use __GFP_LOW when available (2.2.x kernels) to
 * avoid stressing the guest memory system, otherwise simply use
 * GFP_ATOMIC, which is always defined (normally as __GFP_HIGH).
 */
#ifdef	__GFP_LOW
#define	OS_GFP_PRIORITY		__GFP_LOW
#else
#define	OS_GFP_PRIORITY		GFP_ATOMIC
#endif

#endif

/*
 * On 2.5.41 and later kernels the periodic callback is driven through
 * schedule_delayed_work(); older kernels use a timer_list together
 * with the immediate task queue.  This switch is only consulted by the
 * code that is compiled when OS_KTHREAD is not defined.
 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 41)
#define OS_USE_SCHEDULE_DELAYED_WORK
#else
#undef OS_USE_SCHEDULE_DELAYED_WORK
#endif

/*
 * Types
 */

/*
 * State of the single periodic timer: the client callback registered
 * through os_timer_init() plus whatever kernel objects the selected
 * implementation (kernel thread, delayed work, or timer + task queue)
 * needs to drive it.
 */
typedef struct {
   /* registered state */
   os_timer_handler handler;   /* callback invoked once per period */
   void *data;                 /* opaque argument passed to handler */
   int period;                 /* interval between calls; used as a jiffies count */

   /* termination flag: set by os_timer_stop(), cleared on init/start */
   volatile int stop;

   /* system structures */
#ifdef	OS_KTHREAD   
   wait_queue_head_t delay;        /* the thread sleeps here between calls */
   struct semaphore notifyStart;   /* upped by the thread once it is running */
   struct semaphore notifyStop;    /* upped by the thread just before it exits */
   pid_t pid;                      /* kernel_thread() result (negative on failure) */
#else
#ifdef OS_USE_SCHEDULE_DELAYED_WORK
   struct work_struct work;        /* delayed work item running os_timer_bh() */
#else
   struct timer_list timer;        /* fires os_timer_internal() */
   struct tq_struct task;          /* queued on tq_immediate to run os_timer_bh() */
#endif
#endif
} os_timer;

/*
 * Status reporting state registered through os_init().
 */
typedef struct {
   /* registered state */
   os_status_handler handler;   /* called with the procfs read buffer; may be NULL */
   const char *name_verbose;    /* name printed in the load/unload log messages */
   const char *name;            /* short name used as prefix of warning/debug messages */
} os_status;

/*
 * All module-wide state, kept in the single instance "global_state".
 */
typedef struct {
   os_status status;                /* status handler and driver names */
   os_timer timer;                  /* periodic timer state */
   unsigned int totalMemoryPages;   /* total RAM in pages, as last read from si_meminfo() */
} os_state;

/*
 * Globals
 */

/*
 * procfs plumbing.  On 2.4.0 and later kernels the entry is created at
 * run time by os_init() and only a pointer is kept here; on older
 * kernels a statically initialized entry is registered instead.
 * The read callback takes one extra (unused) argument on older kernels.
 */
#ifdef	CONFIG_PROC_FS
#if	LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
static int os_proc_read(char *, char **, off_t, int);
static struct proc_dir_entry *global_proc_entry;
#else
static int os_proc_read(char *, char **, off_t, int, int);
/*
 * NOTE(review): positional initializer; 8 matches strlen("vmmemctl").
 * The remaining positions presumably follow the 2.2.x proc_dir_entry
 * field order -- confirm against that kernel's <linux/proc_fs.h>.
 */
static struct proc_dir_entry global_proc_entry = {
   0, 8, "vmmemctl", S_IFREG | S_IRUGO, 1, 0, 0, 0, NULL, os_proc_read,
};
#endif
#endif	/* CONFIG_PROC_FS */

/* Single instance of the module state; zeroed again by os_init(). */
static os_state global_state;

/*
 * Simple Wrappers
 */

/*
 * Allocate "size" bytes of kernel memory using GFP_ATOMIC.
 * The kmalloc() result is handed back to the caller unchanged.
 */
void * CDECL
os_kmalloc_atomic(unsigned int size)
{
   void *mem = kmalloc(size, GFP_ATOMIC);

   return mem;
}

/*
 * Release memory obtained from os_kmalloc_atomic().
 * "size" is accepted for interface compatibility but is not used.
 */
void CDECL
os_kfree(void *obj, unsigned int size)
{
   (void) size;   /* kfree() does not take the allocation size */
   kfree(obj);
}

/*
 * Clear the first "n" bytes starting at "s" to zero.
 */
void CDECL
os_bzero(void *s, unsigned int n)
{
   (void) memset(s, 0, n);
}

/*
 * Copy "size" bytes from "src" to "dest" using memcpy().
 */
void CDECL
os_memcpy(void *dest, const void *src, unsigned int size)
{
   (void) memcpy(dest, src, size);
}

/*
 * printf-style formatting into "str" via vsprintf().
 *
 * The output is not bounded; the caller must supply a buffer large
 * enough for the formatted result.
 *
 * Returns the value returned by vsprintf().
 */
int CDECL
os_sprintf(char *str, const char *format, ...)
{
   va_list args;
   int len;

   va_start(args, format);
   len = vsprintf(str, format, args);
   /* every va_start() must be paired with va_end() before returning */
   va_end(args);

   return(len);
}

/*
 * System-Dependent Operations
 */

/*
 * Name of the operating system this wrapper layer targets.
 * The returned string is a literal and must not be modified.
 */
char * CDECL
os_identity(void)
{
   char *ident = "linux";

   return ident;
}

/*
 * Predict the maximum achievable balloon size.
 *
 * On 2.4.x and 2.6.x kernels the driver could estimate how many pages
 * are really balloonable, but for now the total RAM size is reported
 * as the upper bound.  Unless the guest kernel was booted with a
 * mem=XX parameter, the total RAM size normally equals alloc.max.
 *
 * The value is also stored in global_state.totalMemoryPages.
 *
 * Returns the maximum achievable balloon size in pages.
 */
unsigned int CDECL
os_predict_max_balloon_pages(void)
{
   os_state *st = &global_state;
   struct sysinfo meminfo;

#if	LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
   /*
    * 2.2.x kernel: si_meminfo() is a costly operation, so query it only
    * while no value has been cached yet.
    */
   if (st->totalMemoryPages == 0) {
      si_meminfo(&meminfo);
      /* totalram is reported in bytes here; convert to pages */
      st->totalMemoryPages = meminfo.totalram >> PAGE_SHIFT;
   }
#else
   /*
    * 2.4.0 and later: si_meminfo() is cheap, and a dynamic maximum is
    * wanted eventually, so refresh the value on every call.  totalram
    * is already expressed in pages.
    */
   si_meminfo(&meminfo);
   st->totalMemoryPages = meminfo.totalram;
#endif

   return st->totalMemoryPages;
}

/*
 * Page handles.
 *
 * Pages come from the newer alloc_page() interface (2.4.x kernels).
 * Clients are given the "struct page *" value, cast to an unsigned
 * long, as their page handle.
 */

/*
 * Convert a page handle to a page number using page_to_pfn().
 */
unsigned long CDECL
os_addr_to_ppn(unsigned long addr)
{
   struct page *pg = (struct page *) addr;
   unsigned long ppn = page_to_pfn(pg);

   return ppn;
}

/*
 * Allocate one page with priority OS_GFP_PRIORITY.
 *
 * Returns the "struct page *" from alloc_page() cast to an unsigned
 * long; the cast is applied to the result whether or not the
 * allocation succeeded.
 */
unsigned long CDECL
os_alloc_reserved_page(void)
{
   return (unsigned long) alloc_page(OS_GFP_PRIORITY);
}

/*
 * Give a page back to the kernel.  "addr" is a page handle, i.e. a
 * "struct page *" stored in an unsigned long.
 */
void CDECL
os_free_reserved_page(unsigned long addr)
{
   __free_page((struct page *) addr);
}

#ifndef	OS_KTHREAD
static void os_timer_add(os_timer *t);

/*
 * Deferred part of the periodic callback.  "data" is the os_timer.
 * Unless the timer has been stopped, run the client handler once and
 * schedule the next period.
 */
static void os_timer_bh(void *data)
{
   os_timer *tmr = (os_timer *) data;

   /* stopped: neither call the handler nor rearm */
   if (tmr->stop) {
      return;
   }

   tmr->handler(tmr->data);
   os_timer_add(tmr);
}

#ifndef OS_USE_SCHEDULE_DELAYED_WORK
/*
 * timer_list callback; "data" carries the os_timer pointer.
 * The actual work is deferred: the timer's task is queued on
 * tq_immediate and the immediate bottom half is marked to run.
 */
static void os_timer_internal(ulong data)
{
   os_timer *tmr = (os_timer *) data;
   struct tq_struct *deferred = &tmr->task;

   queue_task(deferred, &tq_immediate);
   mark_bh(IMMEDIATE_BH);
}
#endif

/*
 * Arm the timer so that the callback runs once, t->period ticks
 * from now.
 */
static void os_timer_add(os_timer *t)
{
#ifndef OS_USE_SCHEDULE_DELAYED_WORK
   /* classic timer: set absolute expiry and enqueue */
   t->timer.expires = jiffies + t->period;
   add_timer(&t->timer);
#else
   /* work-queue variant: the delay is relative */
   schedule_delayed_work(&t->work, t->period);
#endif
}

/*
 * Cancel a pending callback scheduled by os_timer_add().
 */
static void os_timer_remove(os_timer *t)
{
#ifndef OS_USE_SCHEDULE_DELAYED_WORK
   /* take the timer off the kernel's list; the result is not needed */
   (void) del_timer(&t->timer);
#else
   /* drop the pending work item, then flush the shared work queue */
   cancel_delayed_work(&t->work);
   flush_scheduled_work();
#endif
}
#endif

/*
 * Register the periodic callback.
 *
 *   handler - function to call each period
 *   data    - argument passed to handler
 *   period  - interval between calls
 *
 * Only records the registration and prepares the kernel objects of
 * the non-thread implementations; nothing runs until os_timer_start().
 */
void CDECL
os_timer_init(os_timer_handler handler, void *data, int period)
{
   os_timer *tmr = &global_state.timer;

   tmr->handler = handler;
   tmr->data = data;
   tmr->period = period;
   tmr->stop = 0;

#ifndef OS_KTHREAD
#ifndef OS_USE_SCHEDULE_DELAYED_WORK
   /* deferred task that runs the real work */
   tmr->task.routine = os_timer_bh;
   tmr->task.data = tmr;

   /* kernel timer that queues the task above */
   init_timer(&tmr->timer);
   tmr->timer.function = os_timer_internal;
   tmr->timer.data = (ulong) tmr;
#else
   INIT_WORK(&tmr->work, os_timer_bh, tmr);
#endif
#endif
}

#ifdef	OS_KTHREAD
/*
 * Body of the "vmmemctl" kernel thread.  "data" is the os_timer.
 *
 * After detaching and blocking all signals the thread ups notifyStart,
 * then repeatedly sleeps for one period (or until woken with the stop
 * flag set) and calls the registered handler.  Once the stop flag is
 * seen it ups notifyStop and returns 0.
 */
static int os_timer_thread_loop(void *data)
{
   os_timer *tmr = (os_timer *) data;

   /* become a detached daemon thread */
   lock_kernel();
   compat_daemonize("vmmemctl");
   unlock_kernel();

   /* mask every signal and discard anything already pending */
   spin_lock_irq(&current->compat_sigmask_lock);
   sigfillset(&current->blocked);
   spin_unlock_irq(&current->compat_sigmask_lock);
   compat_flush_signals(current);

   /* tell os_timer_thread_start() that the thread is up */
   up(&tmr->notifyStart);

   for (;;) {
      /* sleep until the period elapses or a stop is requested */
      wait_event_interruptible_timeout(tmr->delay, tmr->stop, tmr->period);
      if (tmr->stop) {
         break;
      }

      /* a signal may have ended the sleep early; throw it away */
      if (signal_pending(current)) {
         compat_flush_signals(current);
      }

      tmr->handler(tmr->data);
   }

   /* tell os_timer_thread_stop() that the thread is exiting */
   up(&tmr->notifyStop);
   return 0;
}

/*
 * Create the timer kernel thread and wait until it reports that it is
 * running.
 *
 * Returns 0 on success, or -1 (after logging a warning) when
 * kernel_thread() fails.  The kernel_thread() result is kept in t->pid.
 */
static int os_timer_thread_start(os_timer *t)
{
   const char *name = global_state.status.name;

   /* both semaphores start out locked; the thread releases them */
   init_MUTEX_LOCKED(&t->notifyStart);
   init_MUTEX_LOCKED(&t->notifyStop);
   init_waitqueue_head(&t->delay);

   t->pid = kernel_thread(os_timer_thread_loop, t, 0);
   if (t->pid < 0) {
      printk(KERN_WARNING "%s: unable to create kernel thread (%d)\n", name, t->pid);
      return -1;
   }

   if (OS_DEBUG) {
      printk(KERN_DEBUG "%s: started kernel thread pid=%d\n", name, t->pid);
   }

   /* do not return before the thread has signalled notifyStart */
   down(&t->notifyStart);
   return 0;
}

/*
 * Wake the timer thread and wait for it to exit.
 *
 * The caller must already have set t->stop so that the thread leaves
 * its loop once woken.
 *
 * Nothing is done when no thread is recorded in t->pid: either the
 * state is still zeroed (never started) or kernel_thread() failed and
 * left a negative value.  In both cases nobody would ever up()
 * notifyStop, so waiting on it would block forever.
 */
static void os_timer_thread_stop(os_timer *t)
{
   if (t->pid <= 0) {
      return;
   }

   wake_up_interruptible(&t->delay);
   down(&t->notifyStop);

   /* thread is gone; a repeated stop must not wait again */
   t->pid = 0;
}
#endif

/*
 * Start periodic execution of the handler registered with
 * os_timer_init().  The result of starting the kernel thread is not
 * reported to the caller.
 */
void CDECL
os_timer_start(void)
{
   os_timer *tmr = &global_state.timer;

   /* allow the callback to run */
   tmr->stop = 0;

#ifndef	OS_KTHREAD
   os_timer_add(tmr);
#else
   (void) os_timer_thread_start(tmr);
#endif
}

/*
 * Stop periodic execution: raise the stop flag first, then shut down
 * whichever mechanism is driving the callback.
 */
void CDECL
os_timer_stop(void)
{
   os_timer *tmr = &global_state.timer;

   /* request termination */
   tmr->stop = 1;

#ifndef	OS_KTHREAD
   os_timer_remove(tmr);
#else
   os_timer_thread_stop(tmr);
#endif
}

/*
 * Returns the kernel's HZ constant.
 */
unsigned int CDECL
os_timer_hz(void)
{
   unsigned int hz = HZ;

   return hz;
}

#ifdef	CONFIG_PROC_FS
/*
 * procfs read callback for the "vmmemctl" entry.
 *
 * Passes "buf" to the registered status handler and returns the
 * handler's result; returns 0 when no handler is registered.  The
 * start/offset/length arguments (and the extra argument on pre-2.4.0
 * kernels) are ignored.
 */
#if	LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
static int os_proc_read(char *buf,
                        char **start,
                        off_t offset,
                        int length,
                        int unused)
#else
static int os_proc_read(char *buf,
                        char **start,
                        off_t offset,
                        int length)
#endif
{
   os_status_handler report = global_state.status.handler;

   if (report != NULL) {
      return report(buf);
   }

   /* nothing registered, nothing to show */
   return 0;
}
#endif

/*
 * One-time initialization of the wrapper layer.
 *
 *   name         - short driver name, used as prefix in log messages
 *   name_verbose - descriptive name printed on load and unload
 *   handler      - status callback used to serve reads of the
 *                  "vmmemctl" procfs entry
 *
 * Only the first call has any effect.  It zeroes the global state,
 * records the arguments, registers the procfs entry when
 * CONFIG_PROC_FS is set, and logs the load.
 */
void CDECL
os_init(const char *name,
        const char *name_verbose,
        os_status_handler handler)
{
   static int initialized = 0;
   os_status *status = &global_state.status;

   /* every call after the first is a no-op */
   if (initialized++ != 0) {
      return;
   }

   /* optionally pin the module with an extra reference */
   if (OS_DISABLE_UNLOAD) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 5, 48)
      try_module_get(THIS_MODULE);
#else
      MOD_INC_USE_COUNT;
#endif
   }

   /* start from a clean slate, then record the registration */
   memset(&global_state, 0, sizeof(global_state));
   status->name_verbose = name_verbose;
   status->name = name;
   status->handler = handler;

#ifdef	CONFIG_PROC_FS
#if	LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
   /* older kernels: register the statically initialized entry */
   proc_register(&proc_root, &global_proc_entry);
#else
   /* create the entry; if that fails there is simply no status file */
   global_proc_entry = create_proc_entry("vmmemctl", S_IFREG | S_IRUGO, NULL);
   if (global_proc_entry != NULL) {
      global_proc_entry->get_info = os_proc_read;
   }
#endif
#endif	/* CONFIG_PROC_FS */

   printk(KERN_INFO "%s initialized\n", status->name_verbose);
}

/*
 * Undo os_init(): remove the procfs entry (when CONFIG_PROC_FS is
 * set) and log the unload.  On pre-2.4.0 kernels a failure to
 * unregister the entry is logged as a warning.
 */
void CDECL
os_cleanup(void)
{
   os_status *status = &global_state.status;
   int err;

#ifdef	CONFIG_PROC_FS
#if	LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
   err = proc_unregister(&proc_root, global_proc_entry.low_ino);
   if (err != 0) {
      printk(KERN_WARNING "%s: unable to unregister procfs entry (%d)\n", status->name, err);
   }
#else
   remove_proc_entry("vmmemctl", NULL);
   err = 0;
#endif
#endif	/* CONFIG_PROC_FS */

   printk(KERN_INFO "%s unloaded\n", status->name_verbose);
}

