@@ -18,23 +18,28 @@ class SalesforceCommitMessage(WriterCommitMessage):
 
 class SalesforceDataSource(DataSource):
     """
-    A Salesforce streaming sink for PySpark to write data to Salesforce objects.
+    A Salesforce streaming datasource for PySpark to write data to Salesforce objects.
 
-    This data sink enables writing streaming data from Spark to Salesforce using the
+    This datasource enables writing streaming data from Spark to Salesforce using the
     Salesforce REST API. It supports common Salesforce objects like Account, Contact,
     Opportunity, and custom objects.
 
-    Note: This is a write-only sink, not a full bidirectional data source.
+    Note: This is a write-only datasource, not a full bidirectional data source.
 
-    Name: `salesforce`
+    Name: `pyspark.datasource.salesforce`
 
     Notes
     -----
     - Requires the `simple-salesforce` library for Salesforce API integration
-    - **Write-only sink**: Only supports streaming write operations (no read operations)
+    - **Write-only datasource**: Only supports streaming write operations (no read operations)
     - Uses Salesforce username/password/security token authentication
     - Supports batch writing with Salesforce Composite Tree API for efficient processing
     - Implements exactly-once semantics through Spark's checkpoint mechanism
+    - If a streaming write job fails and is restarted from the checkpoint,
+      it resumes from the last committed offset and does not rewrite batches
+      that were already committed to Salesforce.
+      However, records that reached Salesforce in a batch that had not yet
+      been committed at the time of failure may be duplicated after recovery.
 
     Parameters
     ----------
@@ -57,7 +62,7 @@ class SalesforceDataSource(DataSource):
 
     Examples
     --------
-    Register the Salesforce sink:
+    Register the Salesforce datasource:
 
     >>> from pyspark_datasources import SalesforceDataSource
     >>> spark.dataSource.register(SalesforceDataSource)
@@ -78,9 +83,9 @@ class SalesforceDataSource(DataSource):
     ... (col("value") * 100000).cast("double").alias("AnnualRevenue")
     ... )
     >>>
-    >>> # Write to Salesforce using the sink
+    >>> # Write to Salesforce using the datasource
     >>> query = account_data.writeStream \\
-    ... .format("salesforce") \\
+    ... .format("pyspark.datasource.salesforce") \\
     ... .option("username", "[email protected]") \\
     ... .option("password", "your-password") \\
     ... .option("security_token", "your-security-token") \\
@@ -98,7 +103,7 @@ class SalesforceDataSource(DataSource):
     ... )
     >>>
     >>> query = contact_data.writeStream \\
-    ... .format("salesforce") \\
+    ... .format("pyspark.datasource.salesforce") \\
     ... .option("username", "[email protected]") \\
     ... .option("password", "your-password") \\
     ... .option("security_token", "your-security-token") \\
@@ -114,7 +119,7 @@ class SalesforceDataSource(DataSource):
     ... )
     >>>
     >>> query = custom_data.writeStream \\
-    ... .format("salesforce") \\
+    ... .format("pyspark.datasource.salesforce") \\
     ... .option("username", "[email protected]") \\
     ... .option("password", "your-password") \\
     ... .option("security_token", "your-security-token") \\
@@ -128,7 +133,7 @@ class SalesforceDataSource(DataSource):
     >>> contact_schema = "FirstName STRING NOT NULL, LastName STRING NOT NULL, Email STRING, Phone STRING"
     >>>
     >>> query = contact_data.writeStream \\
-    ... .format("salesforce") \\
+    ... .format("pyspark.datasource.salesforce") \\
     ... .option("username", "[email protected]") \\
     ... .option("password", "your-password") \\
     ... .option("security_token", "your-security-token") \\
@@ -148,7 +153,7 @@ class SalesforceDataSource(DataSource):
     ... )
     >>>
     >>> query = opportunity_data.writeStream \\
-    ... .format("salesforce") \\
+    ... .format("pyspark.datasource.salesforce") \\
     ... .option("username", "[email protected]") \\
     ... .option("password", "your-password") \\
     ... .option("security_token", "your-security-token") \\
@@ -159,7 +164,7 @@ class SalesforceDataSource(DataSource):
 
     Key Features:
 
-    - **Write-only sink**: Designed specifically for writing data to Salesforce
+    - **Write-only datasource**: Designed specifically for writing data to Salesforce
     - **Batch processing**: Uses Salesforce Composite Tree API for efficient bulk writes
     - **Exactly-once semantics**: Integrates with Spark's checkpoint mechanism
     - **Error handling**: Graceful fallback to individual record creation if batch fails
@@ -168,8 +173,8 @@ class SalesforceDataSource(DataSource):
 
     @classmethod
     def name(cls) -> str:
-        """Return the short name for this Salesforce sink."""
-        return "salesforce"
+        """Return the short name for this Salesforce datasource."""
+        return "pyspark.datasource.salesforce"
 
     def schema(self) -> str:
         """
@@ -196,12 +201,12 @@ def schema(self) -> str:
196201 """
197202
198203 def streamWriter (self , schema : StructType , overwrite : bool ) -> "SalesforceStreamWriter" :
199- """Create a stream writer for Salesforce sink integration."""
204+ """Create a stream writer for Salesforce datasource integration."""
200205 return SalesforceStreamWriter (schema , self .options )
201206
202207
203208class SalesforceStreamWriter (DataSourceStreamWriter ):
204- """Stream writer implementation for Salesforce sink integration."""
209+ """Stream writer implementation for Salesforce datasource integration."""
205210
206211 def __init__ (self , schema : StructType , options : Dict [str , str ]):
207212 self .schema = schema
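
The checkpoint-recovery note added in the first hunk follows directly from how Spark drives a Python stream writer: executor-side write() calls are durable in Salesforce as soon as they return, but Spark only records the batch in the checkpoint when driver-side commit() runs. Below is a minimal sketch of that protocol using the write/commit/abort hooks from pyspark.sql.datasource (PySpark 4.0+); the SketchCommitMessage class, its fields, and the print statements are illustrative stand-ins, not part of the Salesforce writer in this diff.

from dataclasses import dataclass
from typing import Iterator, List

from pyspark import TaskContext
from pyspark.sql import Row
from pyspark.sql.datasource import DataSourceStreamWriter, WriterCommitMessage


@dataclass
class SketchCommitMessage(WriterCommitMessage):
    """Illustrative commit message; the field names are hypothetical."""
    partition_id: int
    records_written: int


class SketchStreamWriter(DataSourceStreamWriter):
    def write(self, iterator: Iterator[Row]) -> WriterCommitMessage:
        # Runs on executors. Anything sent to the external system here is
        # durable immediately; a later task or batch failure does not roll
        # it back -- the source of the duplicate-record caveat in the Notes.
        count = sum(1 for _ in iterator)  # stand-in for the actual REST calls
        return SketchCommitMessage(TaskContext.get().partitionId(), count)

    def commit(self, messages: List[WriterCommitMessage], batchId: int) -> None:
        # Runs on the driver only after every partition's write() succeeded.
        # Spark then records the batch in the checkpoint, so a restart
        # resumes from the next offset instead of rewriting this batch.
        total = sum(m.records_written for m in messages)
        print(f"batch {batchId}: committed {total} records")

    def abort(self, messages: List[WriterCommitMessage], batchId: int) -> None:
        # Runs if any task failed. The whole batch is retried on restart,
        # so records already written by the successful tasks can reappear.
        print(f"batch {batchId}: aborted; retry may duplicate records")

Under this protocol, a REST-backed writer like SalesforceStreamWriter can only approximate exactly-once delivery: a batch that fails between write() and commit() is replayed in full on recovery, which is exactly the duplicate-record case the docstring now calls out.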