Skip to content

Reference

This part of the project documentation focuses on an information-oriented approach. Use it as a reference for the technical implementation of the Chilka project code.

Chilka is a corpus-serving library with a simple, sensible interface and a pluggable backend to accommodate different databases.

Chilka implements the following interface:

- `add()`: Add a file to the corpus.
- `remove()`: Remove a file from the corpus.
- `list()`: List files from the corpus.
- `readSents()`: Read sentences of a particular file based on conditions.
- `readBlob()`: Get entire file as a text blob.

The plugin implementation lets you implement and enforce your own schema. The plugin_args argument lets you pass custom arguments to your plugin.

CorpusClient

Bases: CorpusClientAPI

Concrete class implementing the corpus API.

Methods:

Name Description
add

Add a file to the corpus

remove

Remove a file from the corpus

readSents

Read a file stored in the corpus as sentences

readBlob

Read a file stored in the corpus as text blob

list

List the files in the corpus

Source code in chilka.py
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
class CorpusClient(CorpusClientAPI):
    """Concrete class implementing the corpus API.

    Every operation is forwarded to a plugin client. The plugin is loaded
    from the module ``plugins.chilka_<db_plugin>`` when the object is
    created, and its ``CorpusClientImpl`` does the actual work.

    Methods:
        add(): Add a file to the corpus
        remove(): Remove a file from the corpus
        readSents(): Read a file stored in the corpus as sentences
        readBlob(): Read a file stored in the corpus as text blob
        list(): List the files in the corpus
    """

    def __init__(self, db_name: str, connection_string: str, db_plugin=None, plugin_args=None):
        """Load the database plugin and create its client.

        Args:
            db_name (str): The name of the database/corpus
            connection_string (str): The address & port of the database server
            db_plugin (str): The name of the database plugin. Must be given
                even though the signature defaults it to None.
            plugin_args (dict): (optional) Custom arguments handed to the
                plugin. A fresh empty dict is used when omitted.
        Raises:
            TypeError: If db_plugin is not a string.
        """
        # Without this check a missing plugin name fails inside the string
        # concatenation below with a message that never mentions db_plugin.
        if not isinstance(db_plugin, str):
            raise TypeError(
                "db_plugin must be the name of a database plugin (str), got "
                + type(db_plugin).__name__
            )

        # None replaces the former `{}` default so that calls never share
        # (and plugins can never pollute) one module-level dict.
        if plugin_args is None:
            plugin_args = {}

        # Load the plugin
        plugin = importlib.import_module("plugins.chilka_" + db_plugin)

        # Instantiate the plugin client
        self.pu_client = plugin.CorpusClientImpl(db_name, connection_string,
                                                 plugin_args=plugin_args)

    def add(self, filepath: str, plugin_args=None) -> list:
        """Add a file to the corpus.

        Args:
            filepath (str): The path of the file to add to the corpus
            plugin_args (dict): (optional) Custom arguments for the plugin
        Returns:
            list: The list of IDs of objects added, as reported by the plugin
        """
        if plugin_args is None:
            plugin_args = {}  # fresh dict per call, never shared
        return self.pu_client.add_impl(filepath, plugin_args=plugin_args)

    def remove(self, filename: str, plugin_args=None) -> bool:
        """Remove a file from the corpus.

        Args:
            filename (str): The name of the file to be removed from the corpus
            plugin_args (dict): (optional) Custom arguments for the plugin
        Returns:
            bool: True if the collection was removed successfully, false if it
            does not exist
        """
        if plugin_args is None:
            plugin_args = {}  # fresh dict per call, never shared
        return self.pu_client.remove_impl(filename, plugin_args=plugin_args)

    def readSents(self, filename: str, range_filter=None, kw_filter=None,
                  plugin_args=None) -> Iterator:
        """Return a file as an iterator of {n:<>,sent:<>} dictionaries.

        Args:
            filename (str): The name of the file/collection to be read
            range_filter (tuple): (optional) Range of lines to read
            kw_filter (str): (optional) Search term to return sentences containing it
            plugin_args (dict): (optional) Custom arguments for the plugin
        Returns:
            iterator: An iterator of dictionaries containing sentences from the file
            with serial number starting from 1
        """
        if plugin_args is None:
            plugin_args = {}  # fresh dict per call, never shared
        return self.pu_client.readSents_impl(filename,
                                             range_filter=range_filter,
                                             kw_filter=kw_filter,
                                             plugin_args=plugin_args)

    def readBlob(self, filename: str, plugin_args=None) -> str:
        """Read a file as a text blob.

        Args:
            filename (str): The name of the file to be read
            plugin_args (dict): (optional) Custom arguments for the plugin
        Returns:
            str: File content as a single string
        """
        if plugin_args is None:
            plugin_args = {}  # fresh dict per call, never shared
        return self.pu_client.readBlob_impl(filename, plugin_args=plugin_args)

    # Keep this method last: it rebinds the name `list` in the class body,
    # and the `-> list` annotations above must still see the builtin.
    def list(self, plugin_args=None) -> list:
        """List the files in the corpus.

        Args:
            plugin_args (dict): (optional) Custom arguments for the plugin
        Returns:
            list (str): A list containing filenames in the corpus
        """
        if plugin_args is None:
            plugin_args = {}  # fresh dict per call, never shared
        # Use the plugin reference to get list of filenames
        return self.pu_client.list_impl(plugin_args=plugin_args)

__init__(db_name, connection_string, db_plugin=None, plugin_args={})

Init method to accept database details.

Parameters:

Name Type Description Default
db_name str

The name of the database/corpus

required
connection_string str

The address & port of the database server

required
db_plugin str

The name of the database plugin

None

Returns: A corpus client object

Source code in chilka.py
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
def __init__(self, db_name: str, connection_string: str, db_plugin=None, plugin_args=None):
    """Load the database plugin and create its client.

    Args:
        db_name (str): The name of the database/corpus
        connection_string (str): The address & port of the database server
        db_plugin (str): The name of the database plugin. Must be given
            even though the signature defaults it to None.
        plugin_args (dict): (optional) Custom arguments handed to the
            plugin. A fresh empty dict is used when omitted.
    Raises:
        TypeError: If db_plugin is not a string.
    """
    # Without this check a missing plugin name fails inside the string
    # concatenation below with a message that never mentions db_plugin.
    if not isinstance(db_plugin, str):
        raise TypeError(
            "db_plugin must be the name of a database plugin (str), got "
            + type(db_plugin).__name__
        )

    # None replaces the former `{}` default so that calls never share
    # (and plugins can never pollute) one module-level dict.
    if plugin_args is None:
        plugin_args = {}

    # Load the plugin
    plugin = importlib.import_module("plugins.chilka_" + db_plugin)

    # Instantiate the plugin client
    self.pu_client = plugin.CorpusClientImpl(db_name, connection_string,
                                             plugin_args=plugin_args)

add(filepath, plugin_args={})

Adds a file to the file list

Parameters:

Name Type Description Default
filepath str

The path of the file to add to the corpus

required

Returns: list: The list of IDs of objects added

Source code in chilka.py
165
166
167
168
169
170
171
172
173
174
175
176
def add(self, filepath: str, plugin_args=None) -> list:
    """Add a file to the corpus.

    Args:
        filepath (str): The path of the file to add to the corpus
        plugin_args (dict): (optional) Custom arguments for the plugin
    Returns:
        list: The list of IDs of objects added, as reported by the plugin
    """
    # None replaces the former `{}` default: the dict is handed to the
    # plugin, so a shared default could leak state between calls.
    if plugin_args is None:
        plugin_args = {}
    return self.pu_client.add_impl(filepath, plugin_args=plugin_args)

list(plugin_args={})

List the files in the corpus

returns: list (str): A list containing filenames in the corpus

Source code in chilka.py
224
225
226
227
228
229
230
231
232
233
234
235
def list(self, plugin_args=None) -> list:
    """List the files in the corpus.

    Args:
        plugin_args (dict): (optional) Custom arguments for the plugin
    Returns:
        list (str): A list containing filenames in the corpus
    """
    # None replaces the former `{}` default: the dict is handed to the
    # plugin, so a shared default could leak state between calls.
    if plugin_args is None:
        plugin_args = {}

    # Use the plugin reference to get list of filenames
    return self.pu_client.list_impl(plugin_args=plugin_args)

readBlob(filename, plugin_args={})

Reads a file as a text blob

Parameters:

Name Type Description Default
filename str

The name of the file to be read

required

returns: str: File content as a single string

Source code in chilka.py
212
213
214
215
216
217
218
219
220
221
def readBlob(self, filename: str, plugin_args=None) -> str:
    """Read a file as a text blob.

    Args:
        filename (str): The name of the file to be read
        plugin_args (dict): (optional) Custom arguments for the plugin
    Returns:
        str: File content as a single string
    """
    # None replaces the former `{}` default: the dict is handed to the
    # plugin, so a shared default could leak state between calls.
    if plugin_args is None:
        plugin_args = {}
    return self.pu_client.readBlob_impl(filename, plugin_args=plugin_args)

readSents(filename, range_filter=None, kw_filter=None, plugin_args={})

Returns a file as an iterator of {n:<>,sent:<>} dictionaries

Parameters:

Name Type Description Default
filename str

The name of the file/collection to be read

required
range_filter tuple

(optional)Range of lines to read

None
kw_filter str

(optional)Search term to return sentences containing it

None

returns: iterator: An iterator of dictionaries containing sentences from the file with serial number starting from 1

Source code in chilka.py
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
def readSents(self, filename: str, range_filter=None, kw_filter=None,
              plugin_args=None) -> Iterator:
    """Return a file as an iterator of {n:<>,sent:<>} dictionaries.

    Args:
        filename (str): The name of the file/collection to be read
        range_filter (tuple): (optional) Range of lines to read
        kw_filter (str): (optional) Search term to return sentences containing it
        plugin_args (dict): (optional) Custom arguments for the plugin
    Returns:
        iterator: An iterator of dictionaries containing sentences from the file
        with serial number starting from 1
    """
    # None replaces the former `{}` default: the dict is handed to the
    # plugin, so a shared default could leak state between calls.
    if plugin_args is None:
        plugin_args = {}
    return self.pu_client.readSents_impl(filename,
                                         range_filter=range_filter,
                                         kw_filter=kw_filter,
                                         plugin_args=plugin_args)

remove(filename, plugin_args={})

Removes a file from the corpus

Parameters:

Name Type Description Default
filename str

The name of the file to be removed from the corpus

required

Returns: bool: True if the collection was removed successfully, false if it does not exist

Source code in chilka.py
180
181
182
183
184
185
186
187
188
189
190
191
def remove(self, filename: str, plugin_args=None) -> bool:
    """Remove a file from the corpus.

    Args:
        filename (str): The name of the file to be removed from the corpus
        plugin_args (dict): (optional) Custom arguments for the plugin
    Returns:
        bool: True if the collection was removed successfully, false if it
        does not exist
    """
    # None replaces the former `{}` default: the dict is handed to the
    # plugin, so a shared default could leak state between calls.
    if plugin_args is None:
        plugin_args = {}
    return self.pu_client.remove_impl(filename, plugin_args=plugin_args)

CorpusClientAPI

Abstract base class defining the corpus API.

Methods:

Name Description
add

Add a file to the corpus

remove

Remove a file from the corpus

readSents

Read a file stored in the corpus as sentences

readBlob

Read a file stored in the corpus as text blob

list

List the files in the corpus

Source code in chilka.py
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
class CorpusClientAPI(metaclass=abc.ABCMeta):
    """Abstract base class defining the corpus API.

    Every method is abstract, so a concrete client must override all of
    them. ``plugin_args`` defaults to None instead of a shared ``{}`` so
    that implementations copying these signatures do not inherit a
    mutable default argument.

    Methods:
        add(): Add a file to the corpus
        remove(): Remove a file from the corpus
        readSents(): Read a file stored in the corpus as sentences
        readBlob(): Read a file stored in the corpus as text blob
        list(): List the files in the corpus
    """

    @abc.abstractmethod
    def __init__(self, db_name: str, connection_string: str, db_plugin=None, plugin_args=None):
        """Accept the database details.

        Args:
            db_name (str): The name of the database/corpus
            connection_string (str): The address & port of the database server
            db_plugin (str): The name of the database plugin
            plugin_args (dict): (optional) Custom arguments for the plugin
        Raises:
            NotImplementedError: Always; subclasses must override.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def add(self, filepath: str, plugin_args=None) -> list:
        """Add a file to the corpus.

        Args:
            filepath (str): The path of the file to add to the corpus
            plugin_args (dict): (optional) Custom arguments for the plugin
        Returns:
            list: The list of IDs of objects added
        Raises:
            NotImplementedError: Always; subclasses must override.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def remove(self, filename: str, plugin_args=None) -> bool:
        """Remove a file from the corpus.

        Args:
            filename (str): The name of the file to be removed from the corpus
            plugin_args (dict): (optional) Custom arguments for the plugin
        Returns:
            bool: True if the collection was removed successfully, false if it
            does not exist
        Raises:
            NotImplementedError: Always; subclasses must override.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def readSents(self, filename: str, range_filter: tuple = None, kw_filter: str = None,
                  plugin_args=None) -> Iterator:
        """Return a file as an iterator of {n:<>,sent:<>} dictionaries.

        Args:
            filename (str): The name of the file to be read
            range_filter (tuple): (optional) Range of lines to read
            kw_filter (str): (optional) Search term to return sentences containing it
            plugin_args (dict): (optional) Custom arguments for the plugin
        Returns:
            iterator: An iterator of dictionaries containing sentences from the file
            with serial number starting from 1
        Raises:
            NotImplementedError: Always; subclasses must override.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def readBlob(self, filename: str, plugin_args=None) -> str:
        """Read a file as a text blob.

        Args:
            filename (str): The name of the file to be read
            plugin_args (dict): (optional) Custom arguments for the plugin
        Returns:
            str: File content as a single string
        Raises:
            NotImplementedError: Always; subclasses must override.
        """
        raise NotImplementedError

    # Keep this method last: it rebinds the name `list` in the class body,
    # and the `-> list` annotations above must still see the builtin.
    @abc.abstractmethod
    def list(self, plugin_args=None) -> list:
        """List the files in the corpus.

        Args:
            plugin_args (dict): (optional) Custom arguments for the plugin
        Returns:
            list (str): A list containing filenames in the corpus
        Raises:
            NotImplementedError: Always; subclasses must override.
        """
        raise NotImplementedError

__init__(db_name, connection_string, db_plugin=None, plugin_args={}) abstractmethod

Init method to accept database details.

Parameters:

Name Type Description Default
db_name str

The name of the database/corpus

required
connection_string str

The address & port of the database server

required
db_plugin str

The name of the database plugin

None

Returns: A corpus client object

Source code in chilka.py
54
55
56
57
58
59
60
61
62
63
64
65
66
@abc.abstractmethod
def __init__(self, db_name: str, connection_string: str, db_plugin=None, plugin_args=None):
    """Accept the database details.

    ``plugin_args`` defaults to None rather than a shared ``{}`` so that
    implementations copying this signature avoid a mutable default.

    Args:
        db_name (str): The name of the database/corpus
        connection_string (str): The address & port of the database server
        db_plugin (str): The name of the database plugin
        plugin_args (dict): (optional) Custom arguments for the plugin
    Raises:
        NotImplementedError: Always; subclasses must override.
    """
    raise NotImplementedError

add(filepath, plugin_args={}) abstractmethod

Adds a file to the file list

Parameters:

Name Type Description Default
filepath str

The path of the file to add to the corpus

required

Returns: list: The list of IDs of objects added

Source code in chilka.py
68
69
70
71
72
73
74
75
76
77
78
79
@abc.abstractmethod
def add(self, filepath: str, plugin_args=None) -> list:
    """Add a file to the corpus.

    ``plugin_args`` defaults to None rather than a shared ``{}`` so that
    implementations copying this signature avoid a mutable default.

    Args:
        filepath (str): The path of the file to add to the corpus
        plugin_args (dict): (optional) Custom arguments for the plugin
    Returns:
        list: The list of IDs of objects added
    Raises:
        NotImplementedError: Always; subclasses must override.
    """
    raise NotImplementedError

list(plugin_args={}) abstractmethod

List the files in the corpus

returns: list (str): A list containing filenames in the corpus

Source code in chilka.py
121
122
123
124
125
126
127
128
129
130
131
@abc.abstractmethod
def list(self, plugin_args=None) -> list:
    """List the files in the corpus.

    ``plugin_args`` defaults to None rather than a shared ``{}`` so that
    implementations copying this signature avoid a mutable default.

    Args:
        plugin_args (dict): (optional) Custom arguments for the plugin
    Returns:
        list (str): A list containing filenames in the corpus
    Raises:
        NotImplementedError: Always; subclasses must override.
    """
    raise NotImplementedError

readBlob(filename, plugin_args={}) abstractmethod

Reads a file as a text blob

Parameters:

Name Type Description Default
filename str

The name of the file to be read

required

returns: str: File content as a single string

Source code in chilka.py
109
110
111
112
113
114
115
116
117
118
119
@abc.abstractmethod
def readBlob(self, filename: str, plugin_args=None) -> str:
    """Read a file as a text blob.

    ``plugin_args`` defaults to None rather than a shared ``{}`` so that
    implementations copying this signature avoid a mutable default.

    Args:
        filename (str): The name of the file to be read
        plugin_args (dict): (optional) Custom arguments for the plugin
    Returns:
        str: File content as a single string
    Raises:
        NotImplementedError: Always; subclasses must override.
    """
    raise NotImplementedError

readSents(filename, range_filter=None, kw_filter=None, plugin_args={}) abstractmethod

Returns a file as an iterator of {n:<>,sent:<>} dictionaries

Parameters:

Name Type Description Default
filename str

The name of the file to be read

required
range_filter tuple

(optional)Range of lines to read

None
kw_filter str

(optional)Search term to return sentences containing it

None

returns: iterator: An iterator of dictionaries containing sentences from the file with serial number starting from 1

Source code in chilka.py
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
@abc.abstractmethod
def readSents(self, filename: str, range_filter: tuple = None, kw_filter: str = None,
              plugin_args=None) -> Iterator:
    """Return a file as an iterator of {n:<>,sent:<>} dictionaries.

    ``plugin_args`` defaults to None rather than a shared ``{}`` so that
    implementations copying this signature avoid a mutable default.

    Args:
        filename (str): The name of the file to be read
        range_filter (tuple): (optional) Range of lines to read
        kw_filter (str): (optional) Search term to return sentences containing it
        plugin_args (dict): (optional) Custom arguments for the plugin
    Returns:
        iterator: An iterator of dictionaries containing sentences from the file
        with serial number starting from 1
    Raises:
        NotImplementedError: Always; subclasses must override.
    """
    raise NotImplementedError

remove(filename, plugin_args={}) abstractmethod

Removes a file from the corpus

Parameters:

Name Type Description Default
filename str

The name of the file to be removed from the corpus

required

Returns: bool: True if the collection was removed successfully, false if it does not exist

Source code in chilka.py
81
82
83
84
85
86
87
88
89
90
91
92
@abc.abstractmethod
def remove(self, filename: str, plugin_args=None) -> bool:
    """Remove a file from the corpus.

    ``plugin_args`` defaults to None rather than a shared ``{}`` so that
    implementations copying this signature avoid a mutable default.

    Args:
        filename (str): The name of the file to be removed from the corpus
        plugin_args (dict): (optional) Custom arguments for the plugin
    Returns:
        bool: True if the collection was removed successfully, false if it
        does not exist
    Raises:
        NotImplementedError: Always; subclasses must override.
    """
    raise NotImplementedError